diff --git a/dev/bench/data.js b/dev/bench/data.js index 90374865a933a..819f0de84d499 100644 --- a/dev/bench/data.js +++ b/dev/bench/data.js @@ -1,5 +1,5 @@ window.BENCHMARK_DATA = { - "lastUpdate": 1717746526986, + "lastUpdate": 1717746533807, "repoUrl": "https://github.com/neuralmagic/nm-vllm", "entries": { "bigger_is_better": [ @@ -102395,820 +102395,6 @@ window.BENCHMARK_DATA = { } ], "observation_metrics": [ - { - "commit": { - "author": { - "name": "Andy Linfoot", - "username": "andy-neuma", - "email": "78757007+andy-neuma@users.noreply.github.com" - }, - "committer": { - "name": "GitHub", - "username": "web-flow", - "email": "noreply@github.com" - }, - "id": "df1f1a00d1fb111ef035ac385fafa38b5ed34488", - "message": "switch to GCP based build VM (#201)\n\nSUMMARY:\r\n* switch over to GCP VM's for building stage of \"remote push\"\r\n\r\nNOTE: this is just the start. i'll redo the benchmarking and nightly\r\nworkflows in an upcoming PR.\r\n\r\nTEST PLAN:\r\nruns on remote push\r\n\r\nCo-authored-by: andy-neuma ", - "timestamp": "2024-04-23T14:46:41Z", - "url": "https://github.com/neuralmagic/nm-vllm/commit/df1f1a00d1fb111ef035ac385fafa38b5ed34488" - }, - "date": 1714210777058, - "tool": "customBiggerIsBetter", - "benches": [ - { - "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 67721.31853129905, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:35:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 136923.50532217047, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:35:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 427.07219489930145, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:35:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 864.3076764405669, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:35:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 160.639388213287, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:35:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 219.77572066720023, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:35:27 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 24568.955616300445, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:00:11 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 42835.75935418953, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:00:11 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 227.82634990016953, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:00:11 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 374.61978443056046, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:00:11 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 39.055520334651355, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:00:11 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 48.596633468447926, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:00:11 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 7527.377659600279, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:16:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 13679.095036860263, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:16:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 191.89926309954896, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:16:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 331.35296574051563, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:16:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 12.080104513171033, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:16:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 14.650709110785206, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:16:47 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 8710.367588801275, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:48:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 15965.79030186061, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:48:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 275.3981603991636, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:48:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 477.1227850806277, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:48:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 20.97422668532524, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:48:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 38.98037290179124, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:48:41 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 77073.21296540032, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:57:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 146163.68240634102, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:57:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 3753.1767207005037, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:57:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 7267.537872770735, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:57:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 255.0737451625434, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:57:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 383.5779543276513, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:57:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 23023.83349860065, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:36:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 41645.743912039325, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:36:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 232.66971739867586, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:36:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 366.3132179202874, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:36:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 38.46510305387384, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:36:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 47.70392333508084, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:36:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 137370.26431799936, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 215869.07510454953, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 61967.11279730116, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 67496.72838633043, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 231.63840083851147, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 299.54306943821, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:33:58 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 25435.519305999707, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:50:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 43947.11136245994, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:50:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 259.40190170113067, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:50:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 395.49169726957183, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:50:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 82.69133798286272, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:50:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 128.12526599542636, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:50:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 111535.84533939959, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:54:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 139907.84255920982, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:54:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 86109.12421779885, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:54:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 110743.0007109996, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:54:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 79.65798954703632, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:54:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 152.7060449417187, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:54:52 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 168754.36456020072, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 06:03:40 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 274835.28300063947, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 06:03:40 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 55568.49748070054, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 06:03:40 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 61393.967672872786, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 06:03:40 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 310.565785397624, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 06:03:40 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 368.4110025457586, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 06:03:40 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 20220.198251899634, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:42:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 35722.42199180989, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:42:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 177.03949560091132, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:42:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 298.8550787503116, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:42:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 56.43407178889857, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:42:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 69.53072934130422, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:42:15 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 32098.846679599596, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:17:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 56772.15222571119, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:17:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 333.93383340007864, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:17:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 559.5284918699325, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:17:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 69.18295931934536, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:17:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 115.03590451252023, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:17:40 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 22242.201575200306, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:23:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 38416.09949368026, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:23:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 405.60888209947734, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:23:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 633.2039915198765, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:23:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 54.510660354194954, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:23:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 116.7819932999009, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:23:33 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 52838.864513999964, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:55:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 107615.60263015, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:55:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 392.0379830997264, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:55:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 591.367701860072, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:55:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 136.9365175885084, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:55:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 192.056543447482, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:55:16 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 20696.943696499173, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:46:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 40126.07286110094, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:46:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 255.02403710015642, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:46:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 407.3263524987848, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:46:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 46.06854495174866, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:46:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 61.6320865881175, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:46:32 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 20446.17628759952, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:11:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 41362.25675381941, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:11:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 249.30104090035456, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:11:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 437.84951206882346, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:11:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 44.60517419920066, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:11:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 68.91420816637299, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:11:07 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 6740.6105944986875, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:22:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 12953.2546778108, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:22:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 220.07543739982793, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:22:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 364.8997653608961, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:22:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 14.575114716810681, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:22:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 29.404255282207586, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:22:46 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 108043.01666580034, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:25:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 152536.4371675998, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:25:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 73827.56902039946, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:25:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 78139.84387231978, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:25:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 117.39992733990071, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:25:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 153.14561219914486, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 04:25:55 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 8047.533504500825, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:42:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 14416.730261489747, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:42:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 239.05779530032294, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:42:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 404.1848056998787, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:42:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 13.527288472236323, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:42:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 16.885394328748717, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:42:42 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 10284.022520000146, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:28:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 17944.387033951258, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:28:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 290.2058996996857, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:28:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 507.1017831698554, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:28:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 24.138703308314387, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:28:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 53.33755475741691, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:28:48 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 460924.56688270095, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:09:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 503368.02862478956, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:09:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 449794.0928350992, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:09:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 480406.76205557154, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:09:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 82.92364234558158, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:09:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 140.85796713146203, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-04-27 05:09:33 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 12425.266776699209, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:13:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 25722.491432967527, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:13:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 315.02099159879435, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:13:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 500.7434328587614, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:13:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 28.132986743957666, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:13:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" - }, - { - "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.2.0\", \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\", \"torch_version\": \"2.2.1+cu121\"}", - "value": 47.44577373437636, - "unit": "ms", - "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.2.0\",\n \"python_version\": \"3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]\",\n \"torch_version\": \"2.2.1+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22502MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-04-27 06:13:28 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" - } - ] - }, { "commit": { "author": { @@ -143094,6 +142280,820 @@ window.BENCHMARK_DATA = { "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.8.17 (default, Jun 7 2023, 12:29:39) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:54:40 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" } ] + }, + { + "commit": { + "author": { + "name": "Derek Kozikowski", + "username": "derekk-nm", + "email": "106621615+derekk-nm@users.noreply.github.com" + }, + "committer": { + "name": "GitHub", + "username": "web-flow", + "email": "noreply@github.com" + }, + "id": "87571b8be8105738d6da87df053d5a32e7fa001e", + "message": "add more models, new num_logprobs (#285)\n\nadding the `microsoft/phi-2`, `google/gemma-1.1-2b-it`, and\r\n`HuggingFaceH4/zephyr-7b-gemma-v0.1` models to\r\ntest_basic_server_correctness.py. this required increasing the number of\r\nlogprobs included in the evaluation to avoid unexpected failure for a\r\nfew prompts with these models. this did not negatively impact the other\r\nmodels.\r\n\r\nran the test locally multiple times. each time we passed, like this:\r\n```\r\n/root/pyvenv/nmv3119a/bin/python3 /root/.local/share/JetBrains/IntelliJIdea2023.3/python/helpers/pycharm/_jb_pytest_runner.py --target test_basic_server_correctness.py::test_models_on_server -- --forked \r\nTesting started at 2:24 PM ...\r\nLaunching pytest with arguments --forked test_basic_server_correctness.py::test_models_on_server --no-header --no-summary -q in /network/derekk/testdev1/nm-vllm/tests/basic_correctness\r\n\r\n============================= test session starts ==============================\r\ncollecting ... collected 7 items\r\nRunning 7 items in this shard: tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-mistralai/Mistral-7B-Instruct-v0.2-4096-None-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50-4096-sparse_w16a16-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-NousResearch/Llama-2-7b-chat-hf-4096-None-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-neuralmagic/Llama-2-7b-pruned70-retrained-ultrachat-4096-sparse_w16a16-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-microsoft/phi-2-2048-None-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-google/gemma-1.1-2b-it-2056-None-None], tests/basic_correctness/test_basic_server_correctness.py::test_models_on_server[None-5-32-HuggingFaceH4/zephyr-7b-gemma-v0.1-4096-None-None]\r\n\r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-mistralai/Mistral-7B-Instruct-v0.2-4096-None-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50-4096-sparse_w16a16-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-NousResearch/Llama-2-7b-chat-hf-4096-None-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-neuralmagic/Llama-2-7b-pruned70-retrained-ultrachat-4096-sparse_w16a16-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-microsoft/phi-2-2048-None-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-google/gemma-1.1-2b-it-2056-None-None] \r\ntest_basic_server_correctness.py::test_models_on_server[None-5-32-HuggingFaceH4/zephyr-7b-gemma-v0.1-4096-None-None] \r\n\r\n======================== 7 passed in 1332.51s (0:22:12) ========================\r\n```", + "timestamp": "2024-06-06T20:15:52Z", + "url": "https://github.com/neuralmagic/nm-vllm/commit/87571b8be8105738d6da87df053d5a32e7fa001e" + }, + "date": 1717746532845, + "tool": "customBiggerIsBetter", + "benches": [ + { + "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24809.801940600075, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:49:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 43256.87655343981, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:49:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 230.85963800004947, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:49:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 381.02006250007156, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:49:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39.570137278423545, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:49:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 48.9601528261819, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:49:38 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 143332.51583630027, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:12:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 222546.44138056054, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:12:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 67864.02396499924, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:12:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 73827.0890284312, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:12:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 239.8795270796644, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:12:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 308.6613617035208, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:12:01 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 32526.437060700042, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:02:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57773.36524064003, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:02:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 330.9250161998762, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:02:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 545.5887492000373, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:02:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 69.53575750276477, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:02:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 117.19020295559704, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:02:24 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 6915.1410166995465, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:02:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13380.373847579493, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:02:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 226.40810460006836, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:02:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 359.1773655598529, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:02:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15.139491998436716, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:02:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 29.79929538126497, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:02:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 73576.49548339951, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:15:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 148813.4775623996, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:15:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 425.19164659979657, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:15:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 836.7314811297817, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:15:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 174.70690829164758, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:15:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 229.70171741499104, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:15:43 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 23364.73714039989, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:20:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41996.36488448967, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:20:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 233.36182179996285, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:20:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 350.5909243400219, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:20:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 38.84333733572024, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:20:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 48.08712341922941, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:20:11 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 466223.5411801002, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:50:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 509291.63272095995, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:50:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 454586.06324549974, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:50:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 486015.42612930026, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:50:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.72618933186322, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:50:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 171.12997684127475, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:50:30 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 110610.14581850011, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:10:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 155921.46428314017, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:10:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 76759.02024500008, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:10:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81712.44376273022, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:10:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 119.14547809132924, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:10:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 158.16527273312516, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:10:44 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 22691.409517400287, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:02:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 39462.123903539956, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:02:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 406.51223129898426, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:02:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 626.6219724008079, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:02:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.431478742662875, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:02:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 117.34781941991966, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:02:13 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 11215.382277100072, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:34:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19612.768866099803, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:34:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 298.9036618999308, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:34:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 517.92870615046, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:34:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 26.038869194786315, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:34:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 57.07034680126974, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:34:46 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7184.081163700379, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:28:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13875.893549129794, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:28:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 230.8630073996028, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:28:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 366.7540930995157, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:28:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 15.690801239222802, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:28:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 30.531295486664455, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:28:26 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 83522.69804000034, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:41:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 167674.00902064957, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:41:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 431.77151409990995, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:41:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 800.5986656801085, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:41:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 203.5041017158037, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:41:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 261.0510703761275, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:41:39 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12683.44704460087, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:54:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 26111.0034482893, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:54:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 314.93118120051804, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:54:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 570.3491221899092, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:54:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 28.61592294155258, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:54:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 52.83555382492158, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\\nmax-model-len - 4096\\nsparsity - sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 04:54:26 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 20670.799412599994, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:55:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 41707.29727305004, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:55:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 249.81241279986102, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:55:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 434.6498393798992, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:55:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 44.87800991248256, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:55:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 72.94319169829369, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - teknium/OpenHermes-2.5-Mistral-7B\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"tokenizer\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'teknium/OpenHermes-2.5-Mistral-7B', 'tokenizer': 'teknium/OpenHermes-2.5-Mistral-7B', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 02:55:50 UTC\",\n \"model\": \"teknium/OpenHermes-2.5-Mistral-7B\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 8218.9075702001, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:22:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 14720.760121540156, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:22:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 200.3085763999479, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:22:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 341.1576650702042, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:22:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.05153270980138, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:22:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 16.80131226168232, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"tokenizer\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'tokenizer': 'TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:22:10 UTC\",\n \"model\": \"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 25392.581709299528, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:30:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 43627.436878489585, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:30:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 245.67273059983563, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:30:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 403.4982441999091, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:30:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 81.75461039469934, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:30:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 124.31011228923826, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:30:07 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 20969.66754760013, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:27:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 40483.44978941978, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:27:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 256.0066148002534, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:27:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 405.88605057017014, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:27:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 46.767859650149546, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:27:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 61.807399191316264, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"300,1\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 300,\n \"1.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:27:17 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 114838.39343790001, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:35:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 142907.70232090994, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:35:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 88618.44696559978, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:35:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 113273.42560282987, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:35:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 82.62750102888478, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:35:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 197.33145765445408, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - NousResearch/Llama-2-7b-chat-hf\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"tokenizer\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'NousResearch/Llama-2-7b-chat-hf', 'tokenizer': 'NousResearch/Llama-2-7b-chat-hf', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:35:41 UTC\",\n \"model\": \"NousResearch/Llama-2-7b-chat-hf\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 10743.527976099997, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:08:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 18773.01793947992, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:08:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 292.5919326004078, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:08:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 492.5039500004457, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:08:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 24.79929943589153, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:08:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.07471698160512, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"750,2.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 750,\n \"2.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 04:08:59 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 75642.95190749933, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:37:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 141949.51066711123, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:37:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 5267.717306099259, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:37:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7155.850088419046, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:37:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 250.14415547355244, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:37:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 359.49732220760734, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"1500,5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 1500,\n \"5.0\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:37:49 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 19982.480930899692, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:23:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 35210.715996600185, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:23:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 175.0473129985039, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:23:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 290.22134690003423, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:23:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 55.914559414430734, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:23:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 68.25539483251296, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - 2:4 Sparse\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\\nmax-model-len - 4096\\nsparsity - semi_structured_sparse_w16a16\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': '', 'sparsity': 'semi_structured_sparse_w16a16'}\"\n },\n \"date\": \"2024-06-07 05:23:05 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 7825.313705700045, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:56:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_request_latency\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 14168.347028499735, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:56:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 195.37728429986598, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:56:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_ttft_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 335.6715424597767, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:56:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p90_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 12.494404729219081, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:56:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + }, + { + "name": "{\"name\": \"p99_tpot_ms\", \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\", \"gpu_description\": \"NVIDIA A10G x 1\", \"vllm_version\": \"0.5.0\", \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\", \"torch_version\": \"2.3.0+cu121\"}", + "value": 13.95690672861726, + "unit": "ms", + "extra": "{\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"benchmarking_context\": {\n \"vllm_version\": \"0.5.0\",\n \"python_version\": \"3.9.17 (main, Jun 7 2023, 12:29:40) \\n[GCC 9.4.0]\",\n \"torch_version\": \"2.3.0+cu121\",\n \"torch_cuda_version\": \"12.1\",\n \"cuda_devices\": \"[_CudaDeviceProperties(name='NVIDIA A10G', major=8, minor=6, total_memory=22515MB, multi_processor_count=80)]\",\n \"cuda_device_names\": [\n \"NVIDIA A10G\"\n ]\n },\n \"gpu_description\": \"NVIDIA A10G x 1\",\n \"script_name\": \"benchmark_serving.py\",\n \"script_args\": {\n \"description\": \"VLLM Serving - Dense\\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\\nmax-model-len - 4096\\nsparsity - None\\nbenchmark_serving {\\n \\\"nr-qps-pair_\\\": \\\"150,0.5\\\",\\n \\\"dataset\\\": \\\"sharegpt\\\"\\n}\",\n \"backend\": \"vllm\",\n \"version\": \"N/A\",\n \"base_url\": null,\n \"host\": \"localhost\",\n \"port\": 9000,\n \"endpoint\": \"/generate\",\n \"dataset\": \"sharegpt\",\n \"num_input_tokens\": null,\n \"num_output_tokens\": null,\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"tokenizer\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"best_of\": 1,\n \"use_beam_search\": false,\n \"log_model_io\": false,\n \"seed\": 0,\n \"trust_remote_code\": false,\n \"disable_tqdm\": false,\n \"save_directory\": \"benchmark-results\",\n \"num_prompts_\": null,\n \"request_rate_\": null,\n \"nr_qps_pair_\": [\n 150,\n \"0.5\"\n ],\n \"server_tensor_parallel_size\": 1,\n \"server_args\": \"{'model': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'tokenizer': 'neuralmagic/OpenHermes-2.5-Mistral-7B-marlin', 'max-model-len': 4096, 'host': 'localhost', 'port': 9000, 'tensor-parallel-size': 1, 'disable-log-requests': ''}\"\n },\n \"date\": \"2024-06-07 03:56:56 UTC\",\n \"model\": \"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\",\n \"dataset\": \"sharegpt\"\n}" + } + ] } ] }