This repository has been archived by the owner on Oct 11, 2024. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Benchmark Fix : Remove special tokens from warmup prompts (#140)
Summary: When sampling words at random for prompt generation, we sometimes pick up the `<pad>` token. The Tokenizer doesn't recognize this as a special token and leaves it in the prompt as-is. This causes the backend to fail with, ``` ../aten/src/ATen/native/cuda/Indexing.cu:1292: indexSelectLargeIndex: block: [312,0,0], thread: [64,0,0] Assertion `srcIndex < srcSelectDimSize` failed ``` like errors. Test: Manual tests Co-authored-by: Varun Sundar Rabindranath <[email protected]>
- Loading branch information
d630f71
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
bigger_is_better
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.588348820181127
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1074.6733920397724
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
815.6257062458031
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.444956624791462
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
837.84436122289
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
9.357181470217062
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2404.795637845785
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.2410532216344635
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
31.336918812480256
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
50.300128289141064
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
855.1021809153981
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9176201447493112
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.29061881741046
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.863265594761262
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
632.2245273189641
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.24102315373823957
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
31.33300998597114
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.953425594474742
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4052.2612343366104
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.72637056801243
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3527.6772359791944
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.47487300877166355
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
125.52793113870153
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
98.98569576842402
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10.676164559869234
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1387.9013927830003
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10.681213339908233
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1388.5577341880703
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7.395137231417693
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3793.7053997172766
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
28.26064224967858
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3645.622850208537
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.798653120142602
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
526.59706994351
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
417.6208742513501
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.321705971040485
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
865.9108881176315
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.563399556784552
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3068.7830967701852
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.01985662472978
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4138.686224071319
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.98344075208339
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
441.71849278541765
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.899962934437946
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1497.5857668241713
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.452635010163813
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
757.4390947388564
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
562.4676921508875
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27.22663357119486
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
898.4789078494304
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.7503420476020475
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
487.5444661882662
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
29.957223182699884
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
988.5883650290962
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
26.525485302888672
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
450.9332501491074
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.2816444428936262
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
982.8186003207312
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
745.6300910402276
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.4833370448275371
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
127.76531442971115
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.82247565971196
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.3294876433149607
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
719.4079039261484
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
534.1577285791694
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.06293448276078
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4226.952755176839
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.421193144143576
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
444.75510873866483
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.819894044125597
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
886.5862257363277
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.6019208773814375
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1378.2277495933408
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
924.5197683714276
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.7813158588215987
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
101.57106164680783
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.956576908253712
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2317.4970992231056
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9477849011205371
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
279.45753737839414
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
204.58884875587916
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.382522224493425
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1649.8639445920726
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9169017231344665
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.19722400748066
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - 2:4 Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.957445381794802
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - 2:4 Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2317.9031627119775
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15.608318852550081
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4011.337945105371
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27.64288204887178
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
912.2151076127687
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.7279915826616854
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3821.1913722282275
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9163023262684775
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.11930241490208
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.8568159693940944
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3953.236368628947
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.7785887823242
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1675.608270851073
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.4661212433487874
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
126.04539915142344
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
118.10580063971575
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.4188224407518786
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
444.4469172977442
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.750616698144274
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3533.908491423078
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.0187106207889187
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
132.43238070255944
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.85479106374545
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1680.5614191434543
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
48.14860431301536
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3129.6592803459985
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14.019634441285104
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3603.046051410272
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.2571474274019958
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
33.429165562259456
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9641330862447272
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
284.2778542562119
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
184.02408217153106
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7.401778491885712
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3797.1123663373705
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.0033231680683983
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
390.43201184889176
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.7714270386381368
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
230.28551502295778
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.1750898485852126
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1006.1542221181679
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
725.6053998237443
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.7895763552030934
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
568.6032883895824
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
400.70762084623505
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.884424181758
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1682.48757181427
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
23.91118563045864
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3084.5429463291644
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
8.00575216484712
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4106.950860566572
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.606445588609027
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1379.5828538506546
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1047.1172499383867
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.46410926850365064
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
122.682644036255
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
115.11147483606545
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9681113064107114
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
288.6682023078579
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
210.0543371896201
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.2986183078190217
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
709.8746299427223
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
460.3304967970685
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
21.2487830232645
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2762.341793024385
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5.758402684054488
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
748.5923489270834
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.7743462872434089
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
100.66501734164316
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9839584259170587
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
290.12342174306445
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
212.40054567987602
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.510474838666944
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
846.3617290267028
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
62.83274257186766
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1068.1566237217503
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
26.404672466248662
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
871.3541913862058
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.9206528228227049
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3935.4176339637224
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27.247649775539845
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
899.1724425928148
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.7268441900384888
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3820.015294789451
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.6522251053997539
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
538.8522899414613
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
375.55076645736403
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.4919022992713242
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
130.02945378938182
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
122.07703328449541
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
17.930761490800965
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2330.9989938041253
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
12.273292044834916
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1583.2546737837042
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.762650973027906
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3537.001300068172
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.3502888071663706
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
435.53754493162813
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.214286368709619
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
683.8306782940291
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
508.2525311644811
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.9996363593155344
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1413.926592145126
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.809914286853669
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
235.28885729097698
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7.013497110268868
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
911.7546243349528
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.199152766779169
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1013.7795202646508
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
731.2324806712257
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.0204763559791292
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4139.956053401236
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.4285199468166505
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
835.7075930861645
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24.013270693348698
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3097.711919441982
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.361942724186687
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1382.256029867519
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
997.0732236273057
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.019962619953773
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4138.90340828528
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
63.32357052289655
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2089.677827255586
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.850666894886405
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1540.5866963352325
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.88644107179491
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1545.237339333338
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.728305618774376
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3821.5132592437353
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.8094251037472948
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
235.22526348714834
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.9911576491914724
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
258.8504943948914
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
23.995080625333543
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3095.365400668027
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.866210984562146
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1542.607427993079
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.24109237549281223
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
31.34200881406559
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.132139083557524
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1309.433554188544
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
831.0007172973085
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
8.046066563105454
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4127.632146873098
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9793879576442905
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
288.77580393794454
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
211.39109677794366
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.4918996548009227
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
130.0287547500759
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
121.99767305269283
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.669104817751982
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
866.9836263077576
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.946658160046692
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2312.8594893114314
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.9584359186608198
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4012.8351973360195
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.224872863633113
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
687.1000702329355
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
510.72104909555685
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.818247674804624
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
886.3721977246012
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
29.3683185975745
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
499.2614161587665
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5.743279198822125
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2946.3022289957503
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
26.48929856067735
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
450.318075531515
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7.382283310038899
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3787.111338049955
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.4683076829939092
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3555.015375068757
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.9301184082059075
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2305.126162940754
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9839560769126361
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
290.12272913128044
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
212.41643788389987
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.3663158049493953
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1066.751815430414
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
769.4679782742595
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.41681256428251
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1023.3000559394475
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
776.5127959214932
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24.056215141415276
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3103.2517532425704
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.7746077595106033
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
360.6990087363784
tokens/sThis comment was automatically generated by workflow using github-action-benchmark.
d630f71
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
smaller_is_better
{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
250991.94978499963
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
443686.56103570043
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
503251.21682459046
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
221671.07331492135
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
227333.0633495002
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
414566.2403852994
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
473267.7565153697
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
126.80451507466644
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
114.02205183095377
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
159.0816242634369
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
489.3943232083358
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5106.615565498942
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
19739.157375996725
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
34926.049917240154
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
88.28187313340702
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
59.85548350145109
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
175.58698710054156
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
287.80102635068005
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
36.54086126541375
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
30.996988809154328
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
55.52436495386738
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
67.79191198857451
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
674344.2425669996
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1207481.6843793998
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1321142.520273999
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
650921.8909958315
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
657220.7978655001
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1183312.6919875
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1307880.633195389
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
76.6800586424399
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
70.77981138956453
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
92.97386126055588
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
217.9227060239928
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2399.4995354996718
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10151.151842101171
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
17746.521789919716
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
116.02681825334972
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
83.57093850008823
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
289.1190933007238
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
473.42077934079475
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
18.103697460264083
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
16.02233383929389
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24.646386778824148
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
55.35619449473132
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
304688.0397650002
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
537386.8121694977
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
606089.513498347
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
248617.95944837536
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
253041.17153450171
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
479552.45181380084
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
537685.3134576494
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
286.05975662203946
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
250.86937257436892
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
325.24293372935796
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1386.908563218254
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4080.503848000262
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15645.16164450124
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27766.757156900025
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
122.75174194004649
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
67.61830799950985
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
270.73455049885524
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
427.3936426195903
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
23.37761758580714
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
23.21081650605851
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.612497715747132
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
36.36344421992664
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5908.207430999937
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24571.525440900044
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
41548.23684629018
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
191.10034866802258
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
178.6385755003721
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
410.1887348000673
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
703.8313160505459
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
44.702419245494276
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
41.52990188104458
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
59.65912179498673
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
117.38638514422769
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
160622.82429299923
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
285319.90235570085
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
342111.7490468109
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
122282.25634085831
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
124019.2867909991
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
245711.5591851016
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
276715.5687667783
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
209.96150285562797
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
178.8372088565525
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
324.5906218522352
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
528.0143898056211
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6022.783367000102
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
20390.902737600027
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
41212.30877519981
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
129.12288489000275
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
89.44534049987851
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
259.19739910004864
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
423.0251944502923
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
40.54708939613473
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
40.355412029708305
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
44.35917255149317
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
72.85785688676518
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6097.755366500223
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
22960.922855400073
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
41522.31149773986
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
106.63155625998722
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
69.90840449998359
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
247.25749189974522
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
361.49783694977475
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
34.58176563081783
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
35.102934003549805
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
38.37252347781753
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
47.74717718787568
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4739.416366499427
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15546.803114901568
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
32780.13436661011
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
99.88455513668063
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
65.97279000197886
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
198.0765673972202
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
325.50387864004716
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
37.38896417390611
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
32.42852326904923
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
57.702052272659145
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
70.194071689614
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
91249.72328300009
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
172178.43290000074
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
254537.17004031996
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
44739.81954690932
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
43292.593048000526
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
95966.22706099988
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
107481.2860408003
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
220.05213010281182
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
229.51166153239603
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
274.7142004041561
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
368.69578960290943
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
263626.2562815
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
471471.30772499996
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
515713.1468876797
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
248505.75535987265
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
249904.31834650008
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
459632.0596595994
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
493867.9884271999
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
73.3261315677406
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
70.24909443769761
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
87.15264794378773
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
143.46016910748267
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
175528.64354300074
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
312339.5796912
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
377669.6726372103
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
123604.25794161699
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
120231.11166400032
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
251970.23554729988
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
284085.7430519602
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
231.868392688618
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
232.6936919274645
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
265.1473265491053
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
478.8363919391908
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6356.445163499757
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24469.470964800214
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
42692.66540432966
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
110.92727053999019
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
72.1658644999934
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
228.16987929982133
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
382.4169059998301
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
36.168095374841535
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
36.847570126010126
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
38.856006860553336
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
48.55615377184905
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6034.898172500107
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
20691.889469899368
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
39826.268077299675
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
121.7157322633193
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
83.16542550028316
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
248.91378990023446
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
420.1552668894601
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
39.75253643144614
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
39.449776833487235
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
46.03960594809268
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
61.1646615524307
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5934.12300899945
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24774.4847191989
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
42745.47327609066
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
118.54862310931883
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
90.36464950077061
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
236.80249869903494
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
398.1611517605778
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
53.97618118053659
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
46.96606512781914
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
81.44089599966323
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
134.02277821305321
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1784.3108549996032
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6704.182830700848
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
12841.2931336407
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
93.3302357966871
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
55.97060300078738
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
222.9897587985761
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
369.7863062898607
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
12.417530160727523
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.647449202271176
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14.587758606962433
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
29.526560337076248
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
82133.52750800004
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
120052.93040170072
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
149734.2170834503
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
61618.23296246
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
74617.86009050002
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
92600.15133730012
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119873.25422289023
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
72.7272116284387
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
69.61095897933342
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
93.12862270299512
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
154.59966078206244
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2037.1767734995956
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
8001.745354799642
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14320.202196750768
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
118.00973562660754
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
87.0078089992603
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
257.47253320059826
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
397.51055801973035
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.492027922063027
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.513216470435554
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.337577811624522
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15.453693573352512
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13329.08163150023
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
55467.72730410066
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
114961.67531211894
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
208.89051205334303
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
179.4312380006886
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
402.3682395001743
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
684.8117697402085
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
101.9026019082129
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
107.43258363898093
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
140.20052844045725
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
208.28114469749642
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
82563.63815399981
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
188455.65055029935
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
298659.5128741401
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
30778.437577506633
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
23762.649473999772
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
74457.5980183001
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
82872.44585328952
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
283.7256379752175
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
304.16969475875766
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
333.5635612154232
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
418.6307447985352
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15317.778024499603
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
71089.70565669899
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
146099.52737478935
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
207.41730555335258
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
164.48539800057915
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
424.3713370999103
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
724.1277868187899
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
126.1458163410269
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
130.8466406935902
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
174.23506908938273
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
245.87172407156825
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
17420.16532300113
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
75913.48148570069
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
149102.0339663318
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
307.73280690802005
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
166.88944949964934
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
519.3801383979012
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3066.9985082199855
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
162.9742524033701
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
151.71022305122415
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
263.810227848405
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
377.6177162493721
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3894.2915755005743
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13193.522361901345
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27306.193986870177
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
146.28036964666535
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
99.84006300055626
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
319.59140130074974
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
501.84458358038586
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
26.246840665780955
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.459467525782035
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
30.151223350638805
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
55.11489276840722
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1907.4940279997463
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7468.912448099218
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13564.402814809537
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
78.91185938668059
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
39.863186999355094
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
189.3444290994011
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
334.63867685004465
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10.86678077713794
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10.940579359339255
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.978265181169906
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14.618961259595729
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7898.260258499931
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
32147.7272599001
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
56476.37390104015
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
158.08667782533666
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
133.45391550001295
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
316.0129653999776
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
530.5085999998073
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
58.543013513868516
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
57.56845384198476
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
70.36686659853399
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.9998238649914
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2087.4864259994865
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
8633.818766600962
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15777.274397449848
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
138.2953844133893
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
104.73672349962726
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
293.1157074983276
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
477.73680296118937
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15.239197941691748
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.534839750792916
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
20.930495877280816
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
42.96149953787926
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
67134.2510999998
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
116244.80816140007
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
162896.61438532016
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
43763.49512054134
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
43019.2457359999
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
81707.42702230005
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
86768.82677673998
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
106.04788062821095
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
108.34163438205232
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
126.47435938910468
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
180.6259211311704
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
288207.20673849975
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
502154.15729360026
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
586848.0845844796
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
217920.56004375766
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
214136.35563300067
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
430196.0109535006
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
483399.4259213496
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
334.20420609587546
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
309.3883411846394
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
356.5368193883329
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1333.6087864704973
msThis comment was automatically generated by workflow using github-action-benchmark.
d630f71
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
bigger_is_better
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5.784569815912074
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
751.9940760685696
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27.323003833105883
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
901.6591264924941
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.24114256373510773
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
31.348533285564002
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.465715185528444
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3552.358065166655
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.921493944638658
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3937.1410925646105
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.0164365388353063
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4131.678468073543
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
64.38536028887411
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1094.55112491086
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.597512581645871
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
857.6766356139632
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.315812225253204
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1645.5277946414583
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.217959010956969
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1019.739030982154
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
735.363702571872
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.0197924750053766
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
132.57302175069898
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.83623606125992
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
888.7106879637896
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.3551005336061777
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
436.1630693688031
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
29.473041762577807
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
501.0417099638227
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9680816728669731
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
288.6593662765645
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
210.04790750085672
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
50.45520941728765
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
857.73856009389
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.9647578311668425
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2321.322171540369
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.718963188165938
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3811.9372678700865
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
49.162695384406774
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3195.57519998644
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
21.459987007742917
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2789.7983110065793
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10.70186731999985
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1391.2427515999807
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.0062157290514033
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
390.80804477668244
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.961108014405339
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2319.61566321536
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.917484239453379
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.27295112893927
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.7730140562204904
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
230.49182730866374
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.632748276258297
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3101.2077840473294
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.934408326990965
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1551.4730825088254
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
8.118671596999054
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4164.878529260514
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.3818827623793077
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1071.6848285703788
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
773.037525590187
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
63.36836844767164
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2091.1561587731644
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.723244684587158
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3816.325801701837
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.917358488844409
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.25660354977316
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.6006505342287083
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1078.357628279311
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
818.4218653459617
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.9042194104670638
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1499.2202536193524
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
26.516400697560286
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
450.7788118585249
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.9814735001868913
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4060.0392018829402
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
26.045581644072303
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
442.77488794922914
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.3055683421906727
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
989.9835609465705
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
751.0658960219412
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.4919001726361827
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
130.02889163464852
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
121.98468414480482
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.00342260977782
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1415.7113018805317
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.25735702618953754
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
33.45641340463988
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9638067557997434
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
284.18163463674034
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
183.96179547949703
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.690370255422972
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
869.7481332049864
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.003494618396067
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4103.581983855968
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
23.92606081412479
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3086.461845022098
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.2172419970505497
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
684.7434551424644
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
508.6412267687216
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10.739536458471113
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1396.1397396012449
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.46409055865682275
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
122.67769827534453
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
115.1068342951229
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.9922421453341965
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
258.99147889344556
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.945830893651499
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1552.9580161746949
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.7210294092290557
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3814.0551444597822
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.016582425861571
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4131.9773905903585
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.062271691963587
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4225.594696833389
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.86730018154673
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1681.3745118005374
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.638366703802338
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1389.1428979995276
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
932.412205409519
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.908946796388262
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1548.163083530474
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.794862027302289
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
570.2827085094412
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
401.8971291307345
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.24123494572065513
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
31.360542943685168
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7.212828536495898
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
937.6677097444668
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.4918913323688169
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
130.02655479837304
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
122.03823956070347
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.780472089991158
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
361.4613716988506
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.44320030756627
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
837.6160399836151
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.395002260023018
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1392.7322661786943
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1004.5891065855147
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.839353417022581
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
889.1159442129355
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.902000918111366
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1683.630059677239
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
8.00523580843273
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4106.685969725991
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.4833637087315282
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
127.77236276609216
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.82908581927072
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.810844285779236
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
235.40975715130068
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.660176553235297
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
541.4455539346844
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
377.35148984417003
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
26.66158816810069
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
879.8324095473228
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7.397435500081341
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3794.8844115417282
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.4535231486675833
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
757.7133755925142
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
563.4303272242221
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9163670857631944
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.12772114921528
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24.075330360333155
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3105.7176164829766
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.8110142395317264
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
235.43185113912443
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.330363683696408
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
866.4736394402665
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.7560606300714396
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
488.28788190928714
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.77132803439447
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3539.2313048393785
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27.792590455304182
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
917.155485025038
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.660127601090121
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1395.6600618746133
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1058.6241000051054
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
9.35732620386322
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2404.832834392848
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.4297529100733914
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1027.17555578691
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
779.4216380729084
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27.232623026410117
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
898.6765598715339
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.770335101143488
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3538.976120993876
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.46624454107871827
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
126.07874050156647
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
118.13704181852563
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5.735744807133127
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2942.437086059294
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.525443885590882
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
848.3077051268147
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
26.592992023036373
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
452.0808643916184
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24.10991091091508
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3110.1785075080456
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.47481759945898366
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
125.51328424098773
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
98.9741458818936
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9839693649062745
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
290.12664714049805
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
212.40946680231747
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.24112074600951047
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
31.34569698123636
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.140379649010792
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1312.0449069750298
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
832.6579499470636
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.0152404006235547
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4129.227580877663
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.220164078749874
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1020.4377949150477
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
736.0500776882787
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
18.081683196963258
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2350.6188156052235
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7.385248107387498
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3788.632279089786
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.7758890569963299
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
100.86557740952288
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.881584479387114
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
634.6059823203249
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.331633683118509
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
720.070658245212
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
534.5410116570168
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
29.948041643913577
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
988.285374249148
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.856915515716902
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3953.3384036098246
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15.771279354593599
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4053.2187941305547
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.4237601741793604
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
445.08882264331686
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14.088298183259893
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3620.6926330977926
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.3002995063744702
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
710.3938288886063
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
461.0720330576988
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.422482286697215
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
444.9226972706379
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.7826053446726924
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
101.73869480745
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9794810074272519
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
288.8032399766166
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
211.41118064309802
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9478788412986664
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
279.4852359530495
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
204.61544587500546
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7.394276924171252
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3793.2640620998523
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.828573536732577
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1678.8572798876176
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
12.29100981449503
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1585.540266069859
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24.092438446186236
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3107.9245595580246
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.750860047807334
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3533.971032286485
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.225873196546625
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
687.4089997121725
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
510.9566118445997
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - 2:4 Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.966974335001433
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - 2:4 Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2322.35852007327
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
28.739237801203036
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3707.361676355192
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.983903326066122
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
290.1071753683496
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
212.39521099789377
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.941645143964446
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2310.5156035120162
tokens/sThis comment was automatically generated by workflow using github-action-benchmark.
d630f71
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
smaller_is_better
{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
87078.85492650076
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
166687.3078349993
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
250348.21315922923
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
41932.37467958464
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
39771.04279000014
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
90889.26047640071
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
101551.70637641952
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
216.72238039185132
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
225.37556464792502
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
267.6652596057986
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
370.97431710261196
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6033.084445499753
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
20688.617769400687
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
39748.52140693957
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
122.40266319334296
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
86.87733149963606
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
258.09761329983303
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
418.48201770019836
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
39.610865000806
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
39.27937967799597
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
46.1107847520758
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
61.326133140665846
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
65972.68179499997
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
114331.85890740014
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
161469.9170489398
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
42928.57365180666
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
42243.098430000144
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
79235.44589519968
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
85010.27963773998
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
105.56548218927999
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
107.8600926031914
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
125.20116550853594
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
178.3685630039288
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
249557.0056020001
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
441013.6021780003
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
500414.5756039498
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
220162.99745686134
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
225700.40752800013
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
411909.09268700005
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
470507.78565972
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
126.27048751429459
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
113.42975546353826
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
158.23698375808266
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
488.6010758987094
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
301458.02289150015
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
531471.1774119997
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
599351.2158671491
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
245749.70808692768
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
250456.82895400023
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
473812.5403829004
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
531286.8479474895
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
284.07809454138146
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
249.16390695129394
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
322.1764053937818
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1379.2303533035486
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2062.6665575000516
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7995.271003500238
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14336.587134110458
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
116.68523165343989
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
89.01475200036657
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
235.34268360090198
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
406.518968199889
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.560103798031665
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.542171674976787
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.423777259298578
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
16.852707885504962
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4747.03899600172
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15591.889702197668
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
32783.61887073013
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
98.45634529009355
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
67.206363499281
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
195.93438419942686
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
323.60718279895065
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
37.38904949005257
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
32.43962575603367
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
57.78818411988907
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
69.92573880013559
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
12366.243548000057
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
53024.79005149999
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
107782.19263596053
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
204.45467832401968
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
175.3598969999075
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
401.33624830050394
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
654.4088599205496
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
96.8605043294034
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
102.63104027381772
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
138.41955664140664
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
200.2479470876339
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6368.846771000108
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24482.642059899947
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
42715.73665262999
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
110.3944119599752
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
71.354152999902
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
226.62421669997457
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
372.1986422897674
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
36.197558297896094
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
36.93875414049114
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
38.91372388756703
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
48.58186433505949
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
157786.35849450075
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
280817.63101449906
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
338272.11605071987
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
120085.57282628964
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
121816.13273949915
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
241964.11697160147
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
272373.0947019997
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
207.69618124235134
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
176.8209020344984
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
321.02680363619044
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
509.39933619754305
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
263643.4612390003
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
470307.5604824006
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
513597.6942795102
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
247762.879030188
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
249604.9617650001
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
458442.0703108001
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
491436.03123178927
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
72.98175988130473
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
69.95224382587031
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
87.19808111865594
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
162.9864043967971
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1907.4795054998503
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7444.930742199994
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13549.884848169995
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
78.8402330133431
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
37.87884199982727
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
192.21433880020408
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
333.27116598989005
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10.838697982902397
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10.926785527538577
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.950563849344958
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14.476707400044512
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14720.950498999628
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
67933.90517889985
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
137971.71309427067
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
200.73888841600032
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
161.15600249941053
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
415.2144525010954
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
698.971603070604
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.26388369429124
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
124.74668930059234
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
163.5406885047138
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
227.4723519773243
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4067.9405504997703
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15670.68034670101
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27716.811577249686
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
121.98081228005321
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
71.5323755002828
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
273.5991383991859
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
428.55572244005066
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
23.383164225797817
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
23.18772708808538
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.646255505107547
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
36.811073828196534
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
80772.74030499984
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
117672.73102710006
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
147175.68437624982
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
60256.95331741866
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
73386.89722849995
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
89312.30813659976
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
118381.82351395037
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
73.00924180195457
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
69.20645001825224
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
92.46917257997211
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
161.09012643610353
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2383.1894610002564
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
9940.289564999877
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
17717.63242380004
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
116.03213306930895
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
81.68335549908079
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
290.53421659991733
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
498.98087476909495
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
17.950893348628018
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15.829260380697253
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24.335178154805295
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
56.308800943161735
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
170671.05902250024
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
305356.94843809976
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
370297.89290799934
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
120108.89220509402
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
116392.30294150002
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
245431.203282
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
277111.63447574107
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
229.2131987045148
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
229.90028315687096
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
261.8559960768548
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
460.1263772865521
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
286770.2370344996
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
499761.2010583992
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
583936.543034599
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
216693.07293697304
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
213147.30604800023
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
428144.38565260096
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
480797.84968450904
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
333.15006460101984
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
307.9796054774597
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
355.0555979125649
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1332.8271837982684
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6115.9639080005945
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
22951.477896800312
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
41471.33624731973
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
105.23820117326977
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
67.99927150041185
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
228.39327949959622
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
356.76908284036466
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
34.55231061468231
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
35.08082264466393
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
38.32287406296286
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
47.665914421756405
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5113.15667999952
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
19727.629315499686
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
34960.44694636082
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
88.36005334658694
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
59.225850999609975
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
171.8838621998657
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
283.05705117909355
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
36.554409991117375
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
30.99316103713061
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
55.63831760896856
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
67.79407050121553
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2090.1327334995585
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
8407.996431600444
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15702.54364228965
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
137.9247076600162
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
104.50966550070007
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
290.7890258000407
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
477.63374476942727
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15.202480147000642
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.513466976809715
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
20.76668215074865
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
42.632946854503764
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
17308.257391998268
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
75437.07538530149
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
147811.8180262587
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
276.1588823493512
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
164.01686199969845
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
445.0943145991915
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2629.3781739585756
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
161.7103280564558
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
149.45230953046322
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
262.77177963161796
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
377.1719115048861
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
80535.84290950038
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
186866.39935110108
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
295680.31355927954
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
29822.519369631988
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
22943.38446849997
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
71735.21170529893
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
79826.1835862601
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
281.6347227080006
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
302.07970776098017
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
331.1313267480902
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
436.3305208649852
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5892.267117000301
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24405.772280699606
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
41358.992714729844
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
190.25046913197002
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
177.92522899981122
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
403.25762340053194
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
689.0308930098579
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
44.26222015969986
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
40.89595480544346
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
59.721781491863126
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
122.6804189861595
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5903.951419499208
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24516.503485498484
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
42291.664340930096
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
118.21873510531441
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
90.06597550069273
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
243.68274609878426
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
396.27316531848913
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
53.58022328291474
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
46.65217896928982
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
80.07030071814562
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
133.5028109654013
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3896.173127000111
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13228.43184610065
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27273.774564279913
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
146.26822445005018
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
99.76872749939503
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
321.7394509001679
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
496.7795555901275
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
26.237361002839393
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.437152380963937
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
30.131731099280564
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
55.18662507139258
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6004.044217499768
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
20335.66945980001
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
41146.307323220026
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
128.296063536662
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
85.64261450010235
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
257.0049988998562
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
423.0401618596303
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
40.43827075005918
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
40.21481352271153
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
44.27986503406668
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
68.97381901159451
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7865.74364650005
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
32108.276678100174
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
56407.86311968988
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
157.40392523201325
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
126.69729000003827
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
323.76926780016225
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
545.7288574203219
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
58.17538868703537
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
57.23315985311779
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
69.6159418868831
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.3505454250203
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1797.2854459994778
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6670.8323046996165
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
12818.430891070182
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
92.86064251669207
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
59.29699199987226
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
219.008817100621
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
373.176763839437
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
12.379933161881002
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.632780913835274
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14.361142560551347
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
29.51887915249016
msThis comment was automatically generated by workflow using github-action-benchmark.
d630f71
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
bigger_is_better
{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.838116658653091
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
888.9551656249018
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
21.455208803018568
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2789.177144392414
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.017761726933778
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4134.3937784873115
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27.738139975833118
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
915.3586192024928
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.453199176214489
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
757.6133242597332
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
563.2643436555516
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.653996353156336
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
539.4299599614657
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
375.93793377313904
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.9818363312851008
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4060.7826428031713
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.4749120760525874
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
125.53825818374096
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
98.99383921290834
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15.76771755394166
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4052.3034113630065
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9679555520115684
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
288.6217599803028
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
210.01731611994998
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.9931523025292543
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
259.1097993288031
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9839794718276083
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
290.12962719994306
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
212.4182084465713
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.88120990504532
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1682.2786438279456
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9796501255428988
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
288.8531050167422
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
211.44768309717927
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9174208430597062
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.2647095977618
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.2946205484270683
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
708.6400152355701
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
459.6216743321355
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7.996749097517547
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4102.332287026502
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.77997840196837
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1675.698596127944
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.621167989345824
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
860.751838614957
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.241057632997945
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
31.33749228973285
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.017400983249153
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4133.6546146775145
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.523472487983527
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
848.0514234378586
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.964289663985174
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
284.3240217249085
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
184.05396816485015
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
29.191753328910572
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
496.2598065914797
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.144728532941644
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1313.4230248038778
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
833.5325394981177
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24.014245825264574
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3097.8377114591303
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
50.43186572091188
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
857.341717255502
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.0171598375299338
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4133.160507098834
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.2101230970192365
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1017.2559082144257
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
733.5987309514827
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9165469308852165
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.15110101507814
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10.713580306329598
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1392.7654398228476
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.943313313572866
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2311.2955728941292
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.3526564512108386
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
435.845338657409
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5.735040235076589
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2942.0756405942902
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7.22254879064374
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
938.9313427836861
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.4831195446303864
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
127.70782042759635
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.768555910837
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.8002575561631242
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
527.0668054905464
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
417.99340076358703
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7.392211867306257
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3792.2046879281097
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7.3969289179504525
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3794.6245349085825
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.7822145383585009
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
101.68788998660511
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.4253683575679075
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
445.297886483828
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
62.71983301589839
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1066.2371612702727
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
9.352934265142828
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2403.7041061417067
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10.7373162480129
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1395.8511122416771
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.2571656396598578
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
33.431533155781516
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.3038277111880623
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
989.4622599478091
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
750.6704031903657
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
48.98188698430719
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3183.822653979967
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.7239146713050246
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3817.0125380876502
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.772221855565715
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
230.38884122354295
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.7795981868183812
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
361.3477642863896
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.81033572153856
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
235.3436438000128
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.854678762423767
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1680.5541195575447
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
23.944890547186002
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3088.890880586994
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
28.655397893315374
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3696.546328237683
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
8.12000589170112
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4165.563022442675
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.386701548419935
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1390.101853678793
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1002.7385601471192
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.2259245160114984
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
687.4248485314442
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
510.96245660011675
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.933880368636133
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1551.4044479226973
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24.002289627695838
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3096.2953619727627
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27.240582889559622
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
898.9392353554675
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - 2:4 Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.965110050846735
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - 2:4 Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2321.4868553738997
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.940790982664408
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1552.302827746373
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.4918938964530053
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
130.02723258838742
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
121.99952419827437
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.904656415913398
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1499.388063710745
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.063290354579866
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4227.681936534146
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14.065323525804521
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3614.788146131762
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7.390200993934826
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3791.1731098885657
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.2410637549137487
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
31.33828813878733
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.313858941865476
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1645.4008312212559
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.631886090493913
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3100.804660471334
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.9208832190348346
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3935.889715802376
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.467213819780543
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3553.894165275057
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.3309014232261815
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
719.8445168635309
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
534.3669209460128
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.983926972189034
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
290.1141475065106
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
212.40359524302073
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.24091743626280446
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
31.31926671416458
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.008349241622078
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
391.08540141087013
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.466205007519746
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
126.06805010010623
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
118.12702480535323
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
12.276322639055692
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1583.6456204381843
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.858415802520814
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3954.8761975838347
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.6554086394806005
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1394.2467816351652
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1058.1355886819447
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.4301118241576862
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1027.2830468463774
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
779.5146360813873
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.7984515073983114
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
571.4231964133371
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
402.70686360128667
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.7550059923990475
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
488.1507790118762
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
18.130226674859735
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2356.9294677317653
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.7208137970151074
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3813.834141940485
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.780228722681265
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3541.5187817290853
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.725102152942843
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3818.229706766414
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.4640805771823186
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
122.6750597723741
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
115.1043586232732
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9164337604485474
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.13638885831116
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5.784110814199235
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
751.9344058459006
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.8762557901536905
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
633.9132527199797
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.442459963324411
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
837.5197952321734
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.904980357609766
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1547.6474464892697
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
26.476294345781827
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
450.09700387829105
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
63.300770561670696
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2088.925428535133
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.825604547194065
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
439.0352773022991
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.216153014745693
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
684.4071483671966
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
508.39732106540725
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.2113207256281036
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1017.6354247442897
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
734.0222826592344
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.6010404689253526
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1078.4744096916309
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
818.5104968526177
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24.029657788221932
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3099.825854680629
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.33585309831093
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
866.8304513902103
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.7740172184751862
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
100.62223840177421
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.423319793632783
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
445.0315731722618
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.761271255797253
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3536.6467127398937
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.947787454267922
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
279.458290182411
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
204.59255916978788
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.689285855484124
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
869.6071612129363
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27.226010900686816
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
898.4583597226649
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.005201041907341
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4105.331067955024
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.3837006310360183
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1072.2608929690039
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
773.4485460426338
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.0013078585562187
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1414.7144800562107
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
26.52985327532366
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
451.0075056805022
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.0188662957453924
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
132.45261844690103
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.4918979317303251
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
130.02829927359414
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
122.08250801637695
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.727016855733195
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3527.8433319234314
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.958917231029188
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2318.591340540007
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
26.345031764732184
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
869.386048236162
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.8116704427990968
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
235.5171575638826
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.835751656963947
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
888.6477154053132
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
30.056487863213853
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
991.8640994860572
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.963083231703099
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2320.539195815101
tokens/sThis comment was automatically generated by workflow using github-action-benchmark.
d630f71
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
smaller_is_better
{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2396.267635001095
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10097.659225599456
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
17760.128145880317
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
115.31681075599772
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
83.53453349991469
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
286.14248570011114
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
464.5410591595825
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
17.99196292170605
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15.927278076729973
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24.401611523614008
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
54.56936911312645
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
81555.26053250014
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119099.78865240046
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
149561.21620579006
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
60990.40195425867
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
74366.79442150035
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
91057.90338040024
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119139.42147124012
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
71.95794359648812
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
70.09010211777561
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
91.67779503677629
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
162.1993299215472
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5093.61046199956
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
19799.87070710049
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
34911.206816069964
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
89.0074777467089
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
57.78020099933201
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
177.39738280070014
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
305.60825724889554
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
36.53962367356868
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
31.013224575494444
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
55.49448672562282
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
67.86796224875606
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6037.00582249985
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
20610.823792499858
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
39697.10515961993
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
120.91360992670161
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
78.19494699970164
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
261.1417957004961
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
398.08468384981984
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
39.61002177476953
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
39.212595116812984
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
46.00419337069255
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
61.25648014376277
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2082.115367499682
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
8389.760217999903
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15689.799519069576
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
137.8300099867071
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
104.72733849928773
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
290.29186799962207
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
478.5203736389299
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15.243654897537015
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.547955335700586
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
20.99199404575449
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
42.747832071507176
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3890.4319739995117
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13193.985314599924
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27281.57391681881
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
146.28363902658748
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
101.26879349991214
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
322.14451910040225
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
497.0128824293888
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
26.283618091257267
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.525428130988722
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
30.269580823781663
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
55.24540862014753
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6000.312235499223
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25277.507448698456
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
43387.36960866058
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
120.43331071590364
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
92.7387824995094
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
241.336593698361
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
382.8916655597275
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
54.757721723626766
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
47.98806995682375
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
82.34548950409588
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
134.63501169936671
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4737.434841499635
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15560.49583099958
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
32725.33266103244
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
99.00420097315266
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
71.49125849900884
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
199.34117749980948
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
307.1083863612508
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
37.34208241022537
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
32.40595485797562
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
57.70256272738732
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
70.2118572482736
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
17065.19430049775
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
74950.11523039905
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
146351.23040261026
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
250.6739896193612
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
160.44838349989732
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
428.3404127996619
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2079.1166423215323
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
160.6462147193807
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
148.21400922072317
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
262.04812206047916
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
376.53776184513737
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
87768.38190950092
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
167819.06054989996
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
251270.820713719
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
42560.90674879399
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
40620.50408849973
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
92149.70068920084
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
103243.40170894029
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
216.2914954467523
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
225.93440729787284
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
269.3413871816564
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
359.7722747882618
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4080.647298000258
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15603.21051779938
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27824.95839482965
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
121.77041190003365
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
68.529375500475
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
274.98125969887036
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
430.9434738189336
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
23.410234227166207
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
23.19370990704419
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.66025077509929
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
36.8356805020829
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
673371.4527115007
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1206091.8269903003
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1319723.3410466285
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
649974.6918371114
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
656117.8117640002
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1181947.7701799
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1306456.517616721
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
76.64008308004937
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
70.70125935488325
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
92.81345100574721
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
216.97303392337335
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
301566.2592715007
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
531746.1328627993
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
599760.8188527006
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
245955.1311113956
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
250553.67515549914
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
474156.4131383009
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
531585.9743654092
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
284.2793489163629
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
249.07135179506352
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
323.8395446324989
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1379.30769644357
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14938.15822849956
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
69041.30388119956
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
141481.09384481012
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
203.1534606753533
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
163.01583449967438
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
411.52968600054015
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
714.5214555409799
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
121.70450548907097
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
126.90623933857708
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
166.96728933949345
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
233.01746302727017
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7848.359845000232
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
31969.388531300094
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
56479.224581430295
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
157.08337428266591
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
130.36603449972972
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
326.7037606996837
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
540.8171034202586
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
58.43195305783992
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
57.415408974952975
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
70.43155661579102
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
118.43715617379418
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1910.9232499995414
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7451.285555898721
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13595.16302421919
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
78.93666291988968
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
38.90165099892329
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
194.07279209972327
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
333.98297453037185
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10.880222939468224
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10.974422112991885
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.976017341607632
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14.555808360027898
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5791.760614999475
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24457.673812599027
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
41150.95758919045
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
189.1774515653257
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
177.05626100087102
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
408.24815519936243
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
685.793470030676
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
44.09954253296722
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
41.00664992247881
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
59.07884792403113
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
122.35443237796964
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1786.0784015001627
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6674.975513099344
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
12817.257288289311
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
93.02741878667196
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
57.63645699880726
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
220.93406880030673
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
372.8474575402827
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
12.384583059062813
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.679806085758255
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14.58040277782758
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
29.531875924196036
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6108.006427999953
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
22952.280707499354
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
41483.70775330957
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
103.72301187338962
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
67.26208700001735
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
237.93894480004383
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
371.9365236497742
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
34.55010430330312
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
35.068637349023895
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
38.3254489459134
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
47.667581304521924
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
171443.19788350002
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
305906.30051039933
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
370844.0582996699
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
120597.7775865393
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
117002.88137250027
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
246070.5171330008
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
277570.262639239
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
229.433921368436
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
230.29315960395814
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
261.7851532227475
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
471.04519275028014
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
286462.5011614999
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
499453.79468050017
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
583764.090154119
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
216599.2159575767
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
212849.95533899928
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
427777.7183663984
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
480707.610385181
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
332.8798635960917
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
308.01502233720066
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
355.2340049985275
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1331.5692118570732
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
263575.9046779999
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
469364.62291590014
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
511579.08613112976
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
247188.276827232
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
250547.18437850033
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
456097.5446625995
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
489723.5376501299
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
72.75846829294854
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
69.67532989308103
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
87.51954123449036
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
150.0226842143369
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6361.386908999975
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24500.299073699807
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
42715.80671038984
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
110.70105875330532
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
69.69098899980963
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
227.85748270007386
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
373.789667979772
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
36.19108742625828
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
36.92992101845952
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
38.94249036441508
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
48.58739806420929
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
12454.246932999922
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
53405.81969289924
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
108060.53358204055
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
204.16191845068775
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
178.14556699977402
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
397.95014429910225
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
651.1644661993159
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
97.12101858523658
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
102.56699466106447
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
138.86975077746806
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
200.47448387021174
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
81714.6531670005
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
187521.7248965992
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
297025.6627078605
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
30301.363911058637
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
23372.575817500547
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
73269.03860769907
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
81287.90963032878
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
282.32142529592244
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
303.82692936498887
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
332.2685562277847
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
416.39093613039483
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
249700.43319699971
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
441017.5880870995
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
500277.46868850954
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
220279.097000711
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
225967.78826799983
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
411950.9603787004
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
470389.2524246803
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
126.26662626166932
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
113.37217710107508
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
158.8114873121734
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
488.6922014188371
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6025.799117499901
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
20366.530845399713
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
41192.23769952996
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
128.82963457335183
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
84.09444049993908
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
262.1642989000975
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
424.0826311999441
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
40.458798810596804
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
40.33653015811997
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
44.338119202097275
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
68.9952792260916
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
66359.58726600006
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
114597.87733469976
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
161448.99897119025
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
43224.27611526067
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
42590.92600500003
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
80011.77280270022
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
84832.02225194017
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
105.4444210604383
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
107.75818540794252
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
124.61177133045928
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
175.04706178281742
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2042.77660849948
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7992.408615099521
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14350.542385940324
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
117.60475319994536
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
86.38719900045544
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
259.54431380068854
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
397.212828620304
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.492185673687134
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.53481035598854
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.317324717490859
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15.484928914177845
msThis comment was automatically generated by workflow using github-action-benchmark.
d630f71
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
bigger_is_better
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
8.010368877841048
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4109.319234332457
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.7951800533999287
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
570.3837551534616
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
401.9695368370985
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.3076836400116463
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
990.6170707858746
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
751.5465177755395
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
50.44652489175856
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
857.5909231598955
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
26.50320058015462
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
450.5544098626285
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - 2:4 Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.9661382558174205
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - 2:4 Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2321.967602889993
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
26.53812589216917
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
451.14814016687586
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15.766905847862782
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4052.094802900735
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.0191451634429545
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
132.4888712475841
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.901072800210121
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1547.1394640273156
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.8009334510749053
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
527.2646892715533
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
418.15033364897727
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.2162483451055404
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
684.4365889244604
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
508.4132803627979
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.946852245248529
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1553.0907918823088
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.46614287204141847
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
126.05124783829343
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
118.11128091785461
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.991717218963775
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
441.8591927223842
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.7764135206165811
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
100.93375768015554
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.783209765661159
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3542.2849097749177
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.772129919212677
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3539.437389237658
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
23.967795144325056
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3091.845573617932
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.4748520198409185
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
125.5223829247484
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
98.98132069577333
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.3694922743439544
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1067.7584068168558
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
770.2142683668214
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
21.436404546794446
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2786.732591083278
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.8108261809132455
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
235.40740351872194
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.018665762553849
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4136.246147472836
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
28.65892180299731
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3697.0009125866527
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.7829199523541769
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
101.779593806043
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.6259351439023115
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3098.022235882965
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.0176244255285267
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4134.112447907952
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.881568271158798
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1682.301937625322
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.724456194291118
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3817.5675991483954
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
8.117894097747433
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4164.479672144434
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
18.075614988109056
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2349.8299484541776
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.9912838573396279
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
258.8669014541516
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24.099965112378314
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3108.8954994968026
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.322657975910165
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
865.9727684341607
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9166487828876995
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.16434177540093
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.0056401769631385
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
390.733223005208
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.558322563078341
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
852.5819332001843
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
29.85457656750311
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
985.2010267276025
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.7278030529809305
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3820.998129305454
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
63.162950442228166
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2084.3773645935294
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.21836843464353
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1019.8687732541882
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
735.6160363696024
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9839405910873731
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
290.11816308408225
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
212.38685638817975
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10.74844091622052
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1397.2973191086676
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.956495770848747
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2317.4591626180404
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9138774608885251
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
118.80406991550825
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.423017885966531
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
444.99232517564906
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.2412035927289767
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
31.356467054766973
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24.078555227254824
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3106.133624315872
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.98276879143164
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4062.69325364343
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9839653888556055
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
290.1254747887048
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
212.40860849225956
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
26.264256237316825
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
866.7204558314553
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9642204360084584
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
284.303609625214
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
184.04075462093448
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7.402404095026347
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3797.4333007485166
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.628596707888258
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1386.2168851798851
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
930.395767118188
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.4221954279796476
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
444.88540563735415
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.941514910977355
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2310.454711776572
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.46404412092744846
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
122.66542292596172
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
115.09531649989862
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10.709687904223381
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1392.2594275490396
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
9.369739418321567
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2408.0230305086425
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.783521245027302
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1675.9288809267746
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
49.09090516390551
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3190.908835653858
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.6885725248091115
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
869.5144282251845
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.2974010023370317
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
709.4986935484044
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
459.6670557489298
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.87256051155128
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1681.7164332508335
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.25715295370262986
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
33.42988398134188
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14.084360644193112
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3619.6806855576297
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.91759575658036
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1549.287448355447
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.24116300492556422
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
31.35119064032335
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9157272353551662
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.0445405961716
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.835474893883872
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
888.6117362049033
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.8102760206533008
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
235.3358826849291
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.138989320502026
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1311.604325773887
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
832.4114575296582
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27.26155301170061
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
899.6312493861201
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.49188776998467015
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
130.0256131177477
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
122.29969587718848
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
19.467339332158325
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
642.4221979612247
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.002912804168443
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1415.4709968496625
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.87773720907574
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
634.1058371798462
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.6001256271340263
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1078.2004240284937
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
818.3025548381855
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.3814730526138
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1388.4449956427873
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1001.4908215281469
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.3549427861155356
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
436.14256219501965
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.839591026350567
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
889.1468334255737
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.453385424706196
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
757.6708427605988
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
563.2907511514101
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.438311699779716
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
836.980520971363
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.6495432882079235
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
537.9776492340303
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
374.92799306335536
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.521398339519947
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
847.7817841375931
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.752548655116403
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
487.8313251651324
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.647157901856286
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1391.775770974305
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1056.291245828764
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.4832725499680104
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
127.74826585854386
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.80648695406956
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.4919036463101583
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
130.02980986562724
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
122.08064694125508
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5.741934342197581
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2945.612317547359
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.24119207037065368
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
31.354969148184978
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7.3880744037833335
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3790.08216914085
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
29.31079419259806
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
498.283501274167
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.361715764122188
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1648.5115246679422
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5.778429050387001
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
751.1957765503103
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9681275055481615
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
288.67303251266566
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
210.0546248787846
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.778520945302646
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
361.2077228893439
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.225104803742613
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
687.1716995238187
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
510.7713242959143
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.772587515874461
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
230.43637706367994
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
64.14717105532294
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1090.5019079404901
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7.3878901339623475
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3789.987638722684
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.726539857107652
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3819.703353535343
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.961365647324101
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2319.7361220628563
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.921090482265752
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3936.314398162526
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.2045018636085807
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1015.4745955589232
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
732.307720213863
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.903412710195009
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1498.9104807148833
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
12.276972828063204
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1583.7294948201534
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.75247446532987
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3534.3859375897764
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.4284606721063158
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1026.7885438688963
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
779.142829781089
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.0623291638724757
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4225.712456774702
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24.05030284293543
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3102.4890667386708
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.465057529978627
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3551.6839682280925
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.859070124175116
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3955.546877279494
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7.154912133297404
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
930.1385773286626
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.979767673266777
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
288.88776435495345
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
211.47305459790115
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.329870212033165
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
719.5260513481622
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
534.1802162404998
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.0165065011354053
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4131.821820826445
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27.83441927542606
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
918.53583608906
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9479308698353432
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
279.5005767405171
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
204.62351733222332
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.000704620093486
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4100.722235595822
tokens/sThis comment was automatically generated by workflow using github-action-benchmark.
d630f71
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
smaller_is_better
{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
263306.0669235001
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
470129.4410703
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
513449.96094310057
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
247524.74156152934
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
249297.22716449987
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
458282.0205674005
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
491290.54715776985
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
72.99019985855405
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
69.94658844029993
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
87.29886640224683
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
160.5282467157674
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
300781.012593
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
530737.0957537001
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
598647.8824441774
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
245274.15263476002
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
249739.50295050052
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
473127.487509499
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
530582.2039851011
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
283.9943717720441
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
248.80474034252686
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
322.404419731337
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1381.287970888807
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
672861.0211445007
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1205454.4710467996
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1319091.7911802796
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
649585.5741784044
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
655599.6745675001
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1181285.5905302002
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1305815.97616491
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
76.60429510531986
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
70.66506573431084
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
92.63901813074627
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
216.74209568259906
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
12334.632447000331
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
52938.1639528001
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
107922.35028546068
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
203.26328601467927
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
172.62611000023753
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
387.84450590119377
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
663.8284933493738
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
96.39305317815956
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
101.55257140957852
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
138.08504960053898
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
200.07200996813964
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6100.944481999704
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
22938.445371199487
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
41489.32190939026
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
103.66350760665227
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
68.3899184996335
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
233.2615367996368
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
334.7732556202206
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
34.586766399858384
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
35.05351811159287
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
38.80920364968689
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
47.565452221738695
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5101.874079500703
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
19747.436375001413
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
34904.99505718103
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
88.18914504006293
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
59.58470000041416
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
176.08951720067122
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
293.7111236008785
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
36.5131690964038
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
30.94630224748236
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
55.50781091459303
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
67.7404266475195
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
67525.2811050002
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
115909.50133349985
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
163418.45490600995
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
44172.205766978
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
43873.443242500114
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
81035.26023329982
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
86599.0596419804
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
105.71914868752255
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
107.79071143465427
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
126.62799162626895
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
182.51912273501353
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
80791.01661249934
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
186823.43913750123
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
296056.4090643901
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
29839.30657682002
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
22894.279103000372
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
72217.8238710001
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
80325.04383425854
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
281.81249132589096
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
302.69224509340467
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
329.9889365026948
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
413.0344454082397
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1783.6319284997444
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6664.789913099049
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
12807.225365680279
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
92.91593044330511
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
59.986114500134136
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
221.52615709910611
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
367.0959500691963
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
12.352028896586393
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.535055402157433
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14.473368211403
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
29.523730995010723
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2084.2730744998335
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
8385.103225099558
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15681.5874813696
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
137.86632455665918
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
103.84461400008149
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
289.07355589963123
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
478.54449177117164
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15.167668194810204
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.527593341046591
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
20.5667223182084
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
42.62795360417554
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4752.327209500436
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15600.163461898783
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
32792.836014562236
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
98.38961398332079
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
67.11006449950219
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
196.53639099924482
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
312.14099991037045
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
37.413203500043664
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
32.46719014171421
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
57.664469800268634
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
70.20678384232848
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
158294.9148689986
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
281953.6927440982
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
338371.69625422073
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
120475.81167315936
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
121964.681114001
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
242086.52510010143
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
273331.23482915305
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
208.63488499396468
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
177.914287833364
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
325.0766189956743
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
513.501414064069
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6343.981642500239
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24449.37289029999
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
42716.55858805991
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
109.20952638000017
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
68.66251399992507
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
220.33817390001784
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
376.7150706801652
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
36.15909662004622
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
36.90228367973053
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
38.79461607417704
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
48.51975930295615
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5923.304808000466
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24646.616601799193
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
42672.76925111086
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
118.41436013459072
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
91.10782350035151
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
240.63978399972262
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
415.41440013024817
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
53.99044712668971
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
46.95942817759779
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
81.41324515965226
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
134.20522760908443
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
17195.21816249835
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
75256.91730309774
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
147684.65092573824
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
270.3640048479865
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
168.78275949966337
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
460.945160101619
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2510.4749118312247
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
161.8209414821089
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
149.8339512482751
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
262.97594490206797
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
383.24868955829356
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1906.8628505010565
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7463.094820699553
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13566.463512420003
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
79.51437947999996
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
40.52940199926525
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
194.90781479926227
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
337.2531594093193
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10.85803878123841
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10.918934129264372
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.946915803654552
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14.527318144407467
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
249626.07975599985
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
441174.1165095996
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
500566.29144561995
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
220327.28552123602
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
226029.63930999977
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
412251.18380329973
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
470739.1088548903
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
126.30170142285377
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
113.32789414170486
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
159.00204443141365
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
479.025332534477
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14904.017294000369
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
69000.56763830039
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
141870.86356268002
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
206.15468922134824
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
159.72396000051958
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
426.1613975986621
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
738.265142030541
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
122.15958643246923
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
127.10992801428588
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
167.9031341929892
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
245.78518999631567
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2378.9064804996087
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
9957.45510699999
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
17753.63427355982
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
114.94765159330564
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
80.09512149965303
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
291.55616720108816
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
464.8795398207401
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
18.005245424115135
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15.845850328841857
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24.319749309619425
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
56.75295044209441
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
82179.28584050015
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
120522.0900694001
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
150125.45319789017
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
61610.72537380799
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
74438.69021750015
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
92457.59311679968
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
120858.66404898032
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
72.5997502415365
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
69.6647588689569
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
91.07393194385172
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
161.36966161531052
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
171791.82208299972
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
306866.9189862996
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
371864.7533321803
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
120868.66630140203
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
117189.04287950046
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
246710.66200019923
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
278655.7714260487
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
229.14583992169747
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
230.64847192215814
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
262.40776249602254
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
470.7793708931354
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4074.262064499635
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15623.15562279982
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27733.19549841983
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
120.86956716009202
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
65.40768800005026
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
255.60794210086885
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
437.2247466800218
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
23.408384997850007
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
23.20034088972401
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.721696934737196
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
36.737810816501955
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2038.9721195006132
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7996.891902400239
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14333.083189290637
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
116.81236279337706
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
84.89836699936859
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
236.34579030076551
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
407.48633134946425
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.522339154660072
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.526187552956484
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.297506713383429
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
16.843358447401
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6034.362005500043
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
20708.196883199824
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
39735.561204839214
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
121.11195449000357
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
80.31305000031352
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
250.60274579946056
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
394.8763602395956
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
39.671310027749186
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
39.19027717808926
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
46.25501383573721
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
61.38009377687918
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7890.128129000004
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
31966.800546900005
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
56354.42562206975
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
156.81473583333536
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
131.6572629998518
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
325.17226609988944
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
543.3021985500588
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
58.15767134820379
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
57.35754773882319
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
69.49756977402747
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.06240428414705
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
88769.13033550045
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
168109.79201910069
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
251517.12992112996
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
43033.72525619067
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
41976.15632650013
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
92539.37444680014
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
103402.9854613108
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
217.75947179230627
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
226.63255090123457
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
268.7982368320158
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
368.7307331176758
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
286784.06463300146
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
499938.17337180057
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
584041.3591660302
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
216765.79819467658
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
212999.5754395004
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
428146.90473249974
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
480951.1477677012
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
333.34167208619385
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
308.15338186618715
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
355.3946382722181
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1331.3391278835647
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3891.4909499990245
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13187.406758199908
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27241.32537161974
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
146.59046096667834
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
99.48130249995302
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
319.2415596995488
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
500.63240388981177
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
26.21991625509327
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.355494295651543
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
30.271030266476497
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
55.2552846065199
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5851.5573519998725
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24757.80192139919
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
41484.280505690214
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
190.0270900746594
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
182.23212200064154
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
405.10897460007993
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
681.8767160812239
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
44.4305657386916
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
41.15443080530663
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
60.007494788539184
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
122.8491961521776
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5997.540035499924
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
20054.12083260022
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
41125.755409499965
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
127.97915273000245
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
86.51866200011682
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
255.85874499979587
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
424.17089712022494
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
40.45593432617323
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
40.25185869854398
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
44.20893015526626
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
72.56061411615849
msThis comment was automatically generated by workflow using github-action-benchmark.
d630f71
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
bigger_is_better
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.46619889417344884
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
126.06639696975621
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
118.12547580566847
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
30.046439177495504
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
991.5324928573516
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.426415053428332
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1026.1759022129
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
778.6722372435578
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5.746498549253211
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2947.953755766897
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.929938782329437
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1550.8920417028269
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
18.0989351878911
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2352.8615744258427
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.96462063397971
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2321.258023623553
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9171267740938752
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.22648063220377
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.790315599910969
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
568.8381691034455
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
400.8767272384647
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.724607248009644
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3817.722429209885
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
49.04573671514668
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3187.9728864845342
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.42458210987864
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
445.19567428422323
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.6687833229306541
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
544.2525428517424
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
379.32334947986
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.777181397707066
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3540.7356192107163
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.366165031645636
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1648.8007270569663
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.003243956680598
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4103.325055597613
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.608129034698503
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
859.0567745108054
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.003150082855486
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1415.5828419552588
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.872128164392665
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
633.3766613710465
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.441425851876066
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
837.3853607438886
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9839354682738332
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
290.11665260543396
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
212.38247082690688
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.46411045716008736
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
122.68295824569749
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
115.11176965455927
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.6521155859685415
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1393.260546136523
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1057.5421755802988
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.8112491447601737
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
235.46238881882257
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.90667473996263
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1547.8677161951418
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - 2:4 Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.9652093831115565
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - 2:4 Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2321.5332991676396
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.9285785558385005
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2304.406189567849
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9795759109766556
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
288.83122260450347
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
211.43166462520134
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
8.127392851647246
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4169.352532895037
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.226846808682331
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
687.7096771026686
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
511.1712003684899
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.329351605504623
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
719.365891822641
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
534.1234289443507
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.7233250059103753
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3816.408131058135
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.019215928153349
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
132.49807065993537
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
26.79187187078181
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
884.1317717357998
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9839423464815629
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
290.1186806679104
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
212.40363433497495
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.9221295239174523
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3938.44339450686
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.7726608657384142
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
230.44591254599385
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
21.446142502062877
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2787.998525268174
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.835352475559249
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
888.5958218227023
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.80122944286292
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1677.0799137860897
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.2411075290161781
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
31.343978772103153
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.0059721632891376
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
390.7763812275879
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.841233442693937
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
889.3603475502118
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.7239786988757624
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3817.0781663476564
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.017303664255409
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4133.455208059333
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
28.688462744556837
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3700.811694047832
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.453223244974666
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
757.6207573347095
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
563.3287472970426
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.218472219108833
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
685.1233805199839
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
508.9323168783604
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10.741449816480037
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1396.388476142405
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.2103391227966993
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1017.3243646230461
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
733.795774201969
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
8.010013606396791
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4109.136980081554
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5.78861255802945
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
752.5196325438285
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7.401305606561812
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3796.8697761662097
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24.052410781374185
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3102.7609907972696
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9637991596437262
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
284.1793948848182
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
183.96034560119801
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.304190538765596
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
989.570923058062
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
750.7528420908548
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.871031047263404
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1681.6170180721213
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15.78100883329819
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4055.7192701576346
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
29.497024988778815
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
501.4494248092399
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.906470213723028
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1500.0845620696427
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9681718741664802
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
288.6862621993805
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
210.0642515379012
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.392223510381843
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1391.8517082049023
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1003.9861625991093
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9478358671309688
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
279.47256487645654
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
204.60616918467178
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.7797846915983464
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
361.372009907785
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.858479908651586
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3954.9419063678756
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.24135717980559152
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
31.376433374726897
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.0626753347472864
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4226.421760897189
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
26.5378639044522
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
451.14368637568737
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
9.371143510644119
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2408.3838822355383
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.2192968736917926
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1020.1629863041921
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
735.7166455343643
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.982335971107362
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4061.8064047989847
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.4662449355423144
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3552.9010589308723
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
63.287794597340515
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2088.4972217122368
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9173865174103514
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.26024726334569
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.24122656492691877
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
31.35945344049944
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.0159274232570774
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4130.635290253752
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14.084788142906035
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3619.7905527268513
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.0176828840187464
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4134.232229354411
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2.2993500094275015
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
710.1005989114639
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
460.535280488228
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
26.496799571772446
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
450.4455927201316
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.635977275083382
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1388.4272887889638
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
931.3740158672849
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.25717479579843955
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
33.432723453797145
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.78180844882916
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
101.6350983477908
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.753571320899072
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3534.6678294710614
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.49192736369027046
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
130.03607931788608
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
122.08653312065132
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7.224513777287009
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
939.1867910473111
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27.280676117769737
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
900.2623118864012
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
12.28388787137959
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1584.6215354079673
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.355651433355742
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
436.23468633624645
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10.721655529009427
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1393.8152187712253
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.3726531815598206
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1068.7600667044915
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
770.9233126763982
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.740550486369159
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3531.321474996874
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.9070622900479829
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
117.91809770623777
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24.10927558125536
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3110.0965499819413
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.691377077425007
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
869.879020065251
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.7778780693372582
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
101.12414901384358
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.8107647635654032
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
235.3994192635024
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27.284278692060553
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
900.3811968379982
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24.0092608511988
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3097.1946498046455
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.49190747549605807
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
130.03082207262798
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
122.00945083887227
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.474924613198432
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
125.54157225287351
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
98.99645253916916
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7.398675326442591
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3795.520442465049
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.89317202065768
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1683.0561813427491
tokens/s{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.144547682568097
prompts/s{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1313.3657151290045
tokens/s{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
833.4961692823282
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.334752151028558
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
866.7588898168563
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.955302307023388
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1554.1892999130403
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27.790396163560786
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 32,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
917.0830733975059
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
64.2614381535978
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1092.4444486111624
tokens/s{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
0.4833819933105497
prompts/s{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
127.77719611171072
tokens/s{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.83361868830735
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.5247764249418925
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
848.220935242446
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1.9933846113019678
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
259.1399994692558
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.7536497067473453
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
487.97446187715485
tokens/s{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3.4228620701891135
prompts/s{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
444.9720691245847
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.967894851695444
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
441.45421247882257
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24.093783413464813
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3108.0980603369608
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7.398023973089571
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3795.18629819495
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6.606313709379528
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3088.848037957492
tokens/s{"name": "request_throughput", "description": "VLLM Engine throughput - Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4.966719696373579
prompts/s{"name": "token_throughput", "description": "VLLM Engine throughput - Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2322.2394612364305
tokens/s{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
50.193259112663526
prompts/s{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 16,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
853.2854049152799
tokens/sThis comment was automatically generated by workflow using github-action-benchmark.
d630f71
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
smaller_is_better
{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6093.884475500545
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
22960.311056000046
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
41497.53219838001
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
106.24920937334537
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
67.46401249984046
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
231.0143671997138
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
365.1539538300312
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
34.55063468946149
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
35.08914712940464
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
38.3015932409114
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
47.58324042865122
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
286884.45353600034
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
500291.90099169954
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
584822.9429875511
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
216900.5861523663
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
213081.19186299972
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
428693.4078402006
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
481474.1398130788
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
332.9650503783716
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
308.39073081971947
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
355.32343351911976
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1327.3088679858222
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
263677.7010845003
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
472774.09820529976
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
515489.9591982099
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
248153.55264034402
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
250679.83783850013
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
459607.94330120017
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
493302.0551654201
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
72.90664879746807
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
70.05288553534403
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
88.08754917460423
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
148.9090535517682
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
80422.4308785001
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
115966.97935899983
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
144821.51162855045
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
59877.320010357325
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
72913.35838150007
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
88782.80542819948
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
115192.01214167022
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
72.45302212985872
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
69.17640472695027
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
91.36970949873921
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
184.26467997545203
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1785.4301545003182
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6655.088243599949
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
12792.91924167075
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
93.56096721334627
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
57.44456499996886
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
223.7398488005056
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
373.1567245598847
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
12.41124021893168
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.583252690466265
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14.355790504793955
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
29.412175007505525
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6341.34573849974
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24471.513644600236
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
42697.68872629975
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
110.44257595330843
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
67.3398400001588
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
223.65094079978007
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
380.2927610400045
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
36.13204775223976
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
36.87762082018995
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
38.66799327771251
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
48.5130423531519
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
171429.54943049973
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
306376.506395899
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
371281.0633969692
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
120617.83269621164
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
117288.11862749899
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
246360.85341809975
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
277955.5440923803
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
228.60712424647576
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
230.39230390918044
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
262.4699188141633
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
446.575535218762
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
3895.272658999602
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13152.433094400114
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27240.78344926961
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
146.87068273656524
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
99.83540850043937
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
320.6651283004249
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
512.6683834004499
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
26.195951796887964
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.327625733993656
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
30.064506808162726
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
55.16934705535494
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7809.179186000165
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
31832.74222130004
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
56373.085072329784
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
156.7249292653426
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
130.6289619999461
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
324.85720830009086
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
549.3050363700422
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
57.98611598881838
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
57.2781114699141
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
69.59864219013346
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
117.98698941758141
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5915.349460498874
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24775.168203599787
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
41617.5122451698
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
188.57465544401202
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
179.99150099967665
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
402.8597456013813
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
674.1861478500866
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
44.61668011218307
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
41.31814184946734
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
59.96609222577642
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
117.17002776943053
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2076.3950609998574
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
8606.875981899977
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15706.292354279096
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
137.86848546998348
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
106.53052700035914
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
290.419286200995
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
478.68368762006827
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15.201044026724873
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.507361060685726
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
20.84380651624178
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
42.70692799275635
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2384.127324498877
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10080.71269129887
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
17771.033101320205
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
116.1529575373376
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
82.896624499881
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
289.65138819930877
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
478.42477794059954
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
18.00452473810587
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15.90280875088352
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24.423464671032253
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
54.39054578346157
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
12957.832245499958
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
55152.37132260063
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
112687.51845650995
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
204.5032126666265
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
177.54516399963904
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
406.1788343995432
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
658.5754671101312
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
100.84511502424486
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
106.09437110473074
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
139.08139648326375
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
199.67488469320958
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
81791.48188649924
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
187908.4774205991
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
297351.06882022985
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
30304.43374028402
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
23156.789003500307
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
73495.72729080045
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
81698.39463400838
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
282.1266030985424
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
303.2967520614241
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
331.7650844743305
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
413.54837055241177
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4741.9559824993485
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15538.54652010087
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
32734.45059720717
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
98.43498807674527
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
70.1817219996883
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
195.11243870001644
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
311.2737445286981
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
37.37754382746866
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
32.412963010154584
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
57.63222327289194
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
70.14206825351191
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
301594.19004899974
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
531771.2816548987
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
599753.0367116593
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
245842.26863272468
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
250254.6559735001
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
474057.67655949894
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
531619.2467030084
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
284.52705613662863
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
249.0414332751811
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
325.53991394246236
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1379.754885390026
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6038.425479499892
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
20630.771863000275
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
39752.375158729774
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
120.96395991671064
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
77.84736699977657
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
256.5057658001935
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
419.42596272040646
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
39.694889262677975
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
39.23232140136024
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
46.01677895111989
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
62.174139208251965
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14800.069561501005
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
68021.36952850051
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
138525.19005052012
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
201.00579969601435
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
162.07331450095808
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
411.4624976002234
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
700.7723861610247
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119.84537562849214
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
125.65878673862326
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
164.7636834147106
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
235.9988636839045
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
6012.193654000157
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
20321.761147100096
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
41151.444192029936
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
128.04075687333807
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
84.97845800002324
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
251.85158649997004
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
418.1143507800605
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
40.479335663407106
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
40.24862740512492
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
44.25060399439083
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
72.57616124828652
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
87160.22394899937
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
166701.7161822012
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
250744.69612292983
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
42194.35737612533
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
40646.52843250042
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
90865.38426139996
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
101404.48377591911
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
216.23582893848254
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
225.6724421200404
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
269.6881002100054
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
368.8271975075437
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5886.534340999788
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
24513.308360800525
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
42431.76565360182
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
118.16208571209184
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
88.46283800085075
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
247.44117009831825
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
396.38122040087177
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
53.67971873039024
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
46.62967358774543
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
81.24246871756063
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
134.42290758462786
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
157674.67040600057
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
280666.1020246011
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
337616.15788235015
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
119900.09034217305
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
121392.73567750024
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
241485.7874621983
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
271794.0721733926
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
207.91589210840735
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
176.5195705934088
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
320.901716883418
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"3000,10\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
533.4729810858239
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2038.5787060004077
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7995.321425700422
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14325.602640559435
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
116.43372079335677
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
86.872200500693
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
239.00652560023434
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
408.7744996800029
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.522607706762319
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.50892936162238
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13.364718767123499
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
16.840061222402337
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
67038.4366185001
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
115512.77184940009
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
161628.50754860972
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
43662.238786318005
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
42964.62512950006
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
80590.34398830021
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
85720.94017564027
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
106.0712110239474
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
107.53078394527432
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
126.56840543784598
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
177.5728662664396
ms{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
1906.3891264995618
ms{"name": "p90_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
7464.751167699796
ms{"name": "p99_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
13541.99787623027
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
79.05199823332926
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
38.57638850013245
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
188.9380070996594
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
336.19911778965565
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10.86945611581364
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
10.964521730199017
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
11.94302064933402
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
14.505217113225541
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
5069.163692999609
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
19729.109443098423
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
34887.511264609704
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
89.93647531317644
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
60.53061599959619
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
175.433897498624
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
299.69350852177723
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
36.50940051949978
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
30.97476959589593
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
55.59491804321682
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
67.70402774750427
ms{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
17051.294677999977
ms{"name": "p90_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
74455.01032550239
ms{"name": "p99_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
146385.86126928037
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
235.37690458467114
ms{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
160.87861499909195
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
402.70234779818577
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
2054.49705155188
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
160.5281659031224
ms{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
148.02809475031927
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
261.01431393027644
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
377.13890468607855
ms{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
4081.0506114994496
ms{"name": "p90_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
15614.435634300933
ms{"name": "p99_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
27716.863954170756
ms{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
121.47278800001611
ms{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
69.7743159989841
ms{"name": "p90_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
274.6741847997327
ms{"name": "p99_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
429.92728991022847
ms{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
23.399777075594095
ms{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
23.174831670868112
ms{"name": "p90_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
25.762381751964462
ms{"name": "p99_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"}
36.74638201110215
msThis comment was automatically generated by workflow using github-action-benchmark.