Skip to content
This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit

Permalink
Upstream sync v0.4.0.post1 (merged with upstream-v0.4.0.post1) (#157)
Browse files Browse the repository at this point in the history
  • Loading branch information
mgoin authored Apr 4, 2024
1 parent 5d256f5 commit 3d151aa
Show file tree
Hide file tree
Showing 297 changed files with 16,469 additions and 2,466 deletions.
18 changes: 18 additions & 0 deletions .buildkite/download-images.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
#
# Download the test assets (two JPEG images plus their precomputed
# pixel-value / image-feature tensors) from the public
# vllm_opensource_llava S3 prefix into ./images, relative to the
# directory the script is invoked from.

set -ex
set -o pipefail

# Install wget and curl only when at least one of them is missing.
# `command -v` is a shell builtin, so unlike `which` it needs no extra binary.
if ! { command -v wget && command -v curl; }; then
  apt-get update
  apt-get install -y wget curl
fi

# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
readonly base_url="https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava"
readonly assets=(
  stop_sign_pixel_values.pt
  stop_sign_image_features.pt
  cherry_blossom_pixel_values.pt
  cherry_blossom_image_features.pt
  stop_sign.jpg
  cherry_blossom.jpg
)

mkdir -p images

# --timestamping makes re-runs idempotent: an up-to-date local copy is skipped
# instead of being saved again as "<name>.1".
# --directory-prefix writes into images/ without changing the caller's
# working directory, so no `cd` / `cd -` round trip is needed.
for asset in "${assets[@]}"; do
  wget --timestamping --directory-prefix=images "${base_url}/${asset}"
done
9 changes: 6 additions & 3 deletions .buildkite/run-benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
# wait for server to start, timeout after 600 seconds
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend openai \
--dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
--backend vllm \
--dataset-name sharegpt \
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
--model meta-llama/Llama-2-7b-chat-hf \
--num-prompts 20 \
--endpoint /v1/completions \
Expand All @@ -48,7 +49,9 @@ sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line
echo "### Serving Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
tail -n 13 benchmark_serving.txt >> benchmark_results.md # last 13 lines
echo '```' >> benchmark_results.md
tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
echo '```' >> benchmark_results.md

# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
Expand Down
14 changes: 14 additions & 0 deletions .buildkite/run-cpu-test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Builds the CPU docker image and runs the offline inference example inside
# a container started from it. Acts as a sanity check that the CPU build
# compiles and that basic model usage works.
set -ex

# Step 1: build the image from Dockerfile.cpu in the current directory.
docker build \
  -t cpu-test \
  -f Dockerfile.cpu \
  .

# Step 2: make sure no container named "cpu-test" is left around, either
# from an earlier run (removed right away) or from this one (removed on exit).
remove_docker_container() {
  # Removal is best effort: a missing container must not fail the script.
  docker rm -f cpu-test || true
}
trap remove_docker_container EXIT
remove_docker_container

# Step 3: start the container and run the offline inference example in it.
docker run \
  --network host \
  --env VLLM_CPU_KVCACHE_SPACE=1 \
  --name cpu-test \
  cpu-test \
  python3 examples/offline_inference.py
30 changes: 24 additions & 6 deletions .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,35 +12,53 @@ steps:
command: pytest -v -s async_engine

- label: Basic Correctness Test
command: pytest -v -s --forked basic_correctness
command: pytest -v -s basic_correctness

- label: Core Test
command: pytest -v -s core

- label: Distributed Comm Ops Test
command: pytest -v -s --forked test_comm_ops.py
command: pytest -v -s test_comm_ops.py
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only support 1 or 2 for now.

- label: Distributed Correctness Test
command: pytest -v -s --forked test_basic_distributed_correctness.py
- label: Distributed Tests
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only support 1 or 2 for now.
commands:
- pytest -v -s test_pynccl.py
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py

- label: Engine Test
command: pytest -v -s engine tokenization test_sequence.py test_config.py

- label: Entrypoints Test
command: pytest -v -s entrypoints

- label: Examples Test
working_dir: "/vllm-workspace/examples"
commands:
# install aws cli for llava_example.py
- pip install awscli
- python3 offline_inference.py
- python3 offline_inference_with_prefix.py
- python3 llm_engine_example.py
- python3 llava_example.py

- label: Kernels Test %N
command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4

- label: Models Test
commands:
- pytest -v -s models --forked
soft_fail: true
- bash ../.buildkite/download-images.sh
- pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py

- label: Llava Test
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models/test_llava.py

- label: Prefix Caching Test
commands:
Expand Down
5 changes: 5 additions & 0 deletions .buildkite/test-template.j2
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ steps:
queue: amd
command: bash .buildkite/run-amd-test.sh

- label: "CPU Test"
command: bash .buildkite/run-cpu-test.sh

- label: ":docker: build image"
commands:
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
Expand Down Expand Up @@ -53,6 +56,8 @@ steps:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
{% endif %}
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_TOKEN
valueFrom:
secretKeyRef:
Expand Down
5 changes: 2 additions & 3 deletions .github/scripts/lm_eval_compare_hf_vs_vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,10 @@
import os
from typing import Dict, List, Tuple

import numpy as np
import scipy.stats

import lm_eval
import lm_eval.models.utils
import numpy as np
import scipy.stats

os.environ["TOKENIZERS_PARALLELISM"] = "false"

Expand Down
13 changes: 8 additions & 5 deletions .github/scripts/run-tests
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ do
done
echo "..."

# download required artifacts for testing
(cd ${TEST_DIR} && bash ../.buildkite/download-images.sh)

# run selected tests
SUCCESS=0
CC_PYTEST_FLAGS="--cov=${SRC_DIR} --cov=${TEST_DIR} --cov-report=html:cc-vllm-html --cov-append"
Expand All @@ -108,12 +111,12 @@ do

# this is a bit messy and brittle, but certain tests
# need to be run with specific options
if [[ "${TEST}" == *"kernels/test_pos_encoding"* ]]; then
CUDA_VISIBLE_DEVICES=0,1 pytest ${CC_PYTEST_FLAGS} --forked --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
elif [[ "${TEST}" == *"kernels"* || "${TEST}" == *"samplers"* ]]; then
if [[ "${TEST}" == *"kernels"* || "${TEST}" == *"samplers"* ]]; then
CUDA_VISIBLE_DEVICES=0,1 pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
elif [[ "${TEST}" == *"distributed"* || "${TEST}" == *"models_logprobs"* || "${TEST}" == *"basic_correctness"* ]]; then
pytest ${CC_PYTEST_FLAGS} --forked --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
elif [[ "${TEST}" == *"test_basic_distributed_correctness"* ]]; then
CUDA_VISIBLE_DEVICES=0,1 TEST_DIST_MODEL=facebook/opt-125m pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
elif [[ "${TEST}" == *"test_models_logprobs"* ]]; then
pytest --forked ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
else
pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
fi
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
uses: ./.github/workflows/build-test.yml
with:
build_label: aws-avx2-192G-4-a10g-96G
timeout: 240
timeout: 480
gitref: ${{ github.ref }}
Gi_per_thread: 4
nvcc_threads: 8
Expand All @@ -33,7 +33,7 @@ jobs:
uses: ./.github/workflows/build-test.yml
with:
build_label: aws-avx2-32G-a10g-24G
timeout: 300
timeout: 480
gitref: ${{ github.ref }}
Gi_per_thread: 12
nvcc_threads: 1
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/remote-push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
uses: ./.github/workflows/build-test.yml
with:
build_label: aws-avx512-192G-4-T4-64G
timeout: 240
timeout: 360
gitref: '${{ github.ref }}'
Gi_per_thread: 4
nvcc_threads: 8
Expand Down
7 changes: 5 additions & 2 deletions .github/workflows/ruff.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,13 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1
pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
- name: Analysing the code with ruff
run: |
ruff .
- name: Spelling check with codespell
run: |
codespell --toml pyproject.toml
codespell --toml pyproject.toml
- name: Run isort
run: |
isort . --check-only
3 changes: 2 additions & 1 deletion .github/workflows/scripts/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ $python_executable -m pip install -r requirements.txt
export MAX_JOBS=1
# Make sure punica is built for the release (for LoRA)
export VLLM_INSTALL_PUNICA_KERNELS=1

# Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
# Build
$python_executable setup.py bdist_wheel --dist-dir=dist
23 changes: 20 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@ cmake_minimum_required(VERSION 3.21)

project(vllm_extensions LANGUAGES CXX)

# VLLM_TARGET_DEVICE holds a backend name (compared against "cuda", "rocm" and
# "cpu" below), so it is declared as a STRING cache entry; option() only models
# boolean ON/OFF switches. A value passed with -DVLLM_TARGET_DEVICE=... still
# takes precedence over this default.
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")

message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")

include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)

Expand Down Expand Up @@ -70,10 +73,24 @@ find_package(Torch REQUIRED)
# config is used for standalone C++ binaries that link against torch).
# The `libtorch_python.so` library defines some of the glue code between
# torch/python via pybind and is required by VLLM extensions for this
# reason. So, add it by manually using `append_torchlib_if_found` from
# torch's cmake setup.
# reason. So, add it by manually with `find_library` using torch's
# installed library path.
#
find_library(torch_python_LIBRARY torch_python PATHS
"${TORCH_INSTALL_PREFIX}/lib")

#
append_torchlib_if_found(torch_python)
# Forward the non-CUDA device extensions to external CMake scripts.
#
if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
NOT VLLM_TARGET_DEVICE STREQUAL "rocm")
if (VLLM_TARGET_DEVICE STREQUAL "cpu")
include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
else()
message(FATAL_ERROR "Unsupported vLLM target device: ${VLLM_TARGET_DEVICE}")
endif()
return()
endif()

#
# Set up GPU language and check the torch version and warn if it isn't
Expand Down
11 changes: 9 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-build.txt

# install compiler cache to speed up compilation leveraging local or remote caching
RUN apt-get update -y && apt-get install -y ccache

# copy input files
COPY csrc csrc
COPY setup.py setup.py
Expand All @@ -56,7 +59,9 @@ ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1

RUN python3 setup.py build_ext --inplace
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
python3 setup.py build_ext --inplace
#################### EXTENSION Build IMAGE ####################

#################### FLASH_ATTENTION Build IMAGE ####################
Expand Down Expand Up @@ -97,7 +102,7 @@ RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip instal

#################### RUNTIME BASE IMAGE ####################
# We used base cuda image because pytorch installs its own cuda libraries.
# However cupy depends on cuda libraries so we had to switch to the runtime image
# However pynccl depends on cuda libraries so we had to switch to the runtime image
# In the future it would be nice to get a container with pytorch and cuda without duplicating cuda
FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 AS vllm-base

Expand Down Expand Up @@ -131,5 +136,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

ENV VLLM_USAGE_SOURCE production-docker-image

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################
20 changes: 20 additions & 0 deletions Dockerfile.cpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# This vLLM Dockerfile builds an image that can build and run vLLM on the x86 CPU platform.

FROM ubuntu:22.04

# Install the build toolchain and Python, then register gcc-12/g++-12 as the
# gcc/g++ alternatives.
RUN apt-get update -y \
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

# The setuptools specifier must be quoted: RUN uses shell form, and an unquoted
# `setuptools>=49.4.0` is parsed as `setuptools` with stdout redirected to a
# file named "=49.4.0", which silently drops the version constraint.
RUN pip install --upgrade pip \
&& pip install wheel packaging ninja "setuptools>=49.4.0" numpy

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm

# CPU-only PyTorch wheels are served from the extra index.
RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu

# Build and install vLLM with the CPU target selected.
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install

CMD ["/bin/bash"]
19 changes: 1 addition & 18 deletions Dockerfile.rocm
Original file line number Diff line number Diff line change
Expand Up @@ -78,23 +78,6 @@ RUN if [ "$BUILD_FA" = "1" ]; then \
RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi

# build cupy
RUN if [ "$BUILD_CUPY" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& git clone -b hipgraph_enablement --recursive https://github.com/ROCm/cupy.git \
&& cd cupy \
&& pip install mpi4py-mpich \
&& pip install scipy==1.9.3 \
&& pip install cython==0.29.* \
&& env CC=$MPI_HOME/bin/mpicc python -m pip install mpi4py \
&& export CUPY_INSTALL_USE_HIP=1 \
&& export ROCM_HOME=/opt/rocm \
&& export HCC_AMDGPU_TARGET="gfx90a,gfx942,gfx1100" \
&& pip install . \
&& cd ..; \
fi

COPY ./ /app/vllm

RUN python3 -m pip install --upgrade pip
Expand All @@ -110,6 +93,6 @@ RUN cd /app \
&& cd ..

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir ray[all]
RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3

CMD ["/bin/bash"]
Loading

3 comments on commit 3d151aa

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bigger_is_better

Benchmark suite Current: 3d151aa Previous: 5d256f5 Ratio
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 0.24143119424980988 prompts/s 0.24238101845555668 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 31.386055252475284 tokens/s 31.509532399222365 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 0.9001633751519507 prompts/s 0.9222732575656754 prompts/s 1.02
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 117.0212387697536 tokens/s 119.89552348353779 tokens/s 1.02
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 7.442484720108105 prompts/s 7.460735863089987 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3817.994661415458 tokens/s 3827.3574977651633 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 10.56511636525732 prompts/s 10.905722644431853 prompts/s 1.03
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1373.4651274834514 tokens/s 1417.743943776141 tokens/s 1.03
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 13.873830194719073 prompts/s 13.873960314933052 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3565.574360042802 tokens/s 3565.6078009377943 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 11.92040265266792 prompts/s 12.108480276642714 prompts/s 1.02
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1549.6523448468297 tokens/s 1574.1024359635528 tokens/s 1.02
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 24.21516471769031 prompts/s 24.28415118487307 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3123.7562485820504 tokens/s 3132.655502848626 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2.4556259124360813 prompts/s 2.4571297792211517 prompts/s 1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 758.3627651179269 tokens/s 758.8271992842708 tokens/s 1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 563.9426762106542 tokens/s 564.0980261944751 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 21.13092831346938 prompts/s 22.187746647250002 prompts/s 1.05
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2747.0206807510194 tokens/s 2884.4070641425 tokens/s 1.05
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3.879427381457162 prompts/s 3.886940771478625 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3976.4130659935913 tokens/s 3984.1142907655903 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3.945421236474008 prompts/s 3.9828121056625134 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1515.041754806019 tokens/s 1529.399848574405 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 11.931318047002925 prompts/s 12.13488564141119 prompts/s 1.02
{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1551.0713461103803 tokens/s 1577.5351333834546 tokens/s 1.02
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2.0370412235347892 prompts/s 2.0384512363578784 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 4173.8974670227835 tokens/s 4176.786583297292 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1.93680818167011 prompts/s 1.9387578726511672 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3968.5199642420553 tokens/s 3972.5148810622413 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 5.740806571709462 prompts/s 5.902264373255192 prompts/s 1.03
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 746.3048543222301 tokens/s 767.294368523175 tokens/s 1.03
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 24.215979804646548 prompts/s 25.088267424158126 prompts/s 1.04
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1574.0386873020257 tokens/s 1630.7373825702782 tokens/s 1.04
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3.41471942239827 prompts/s 3.4476110965203293 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 443.9135249117751 tokens/s 448.18944254764284 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 0.9479735055401838 prompts/s 0.9485305349718826 prompts/s 1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 279.5131480202083 tokens/s 279.67739000490946 tokens/s 1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 204.63904064095945 tokens/s 204.74980127903055 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 13.324599679946262 prompts/s 13.352467062897055 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 866.0989791965071 tokens/s 867.9103590883086 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 6.581583909802736 prompts/s 6.617185367210194 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 855.6059082743557 tokens/s 860.2340977373252 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 13.896956458289448 prompts/s 13.85864605073523 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3571.5178097803882 tokens/s 3561.672035038954 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 8.177099489963267 prompts/s 8.182555497020477 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 4194.852038351156 tokens/s 4197.650969971505 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 15.91869545371545 prompts/s 15.90827947827011 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 4091.1047316048707 tokens/s 4088.4278259154185 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3.58836784096709 prompts/s 3.6458619099040686 prompts/s 1.02
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1137.117885124061 tokens/s 1155.3371806295002 tokens/s 1.02
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 820.241082314394 tokens/s 833.3759765150852 tokens/s 1.02
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 0.24142575822155676 prompts/s 0.24244688249619376 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 31.38534856880238 tokens/s 31.518094724505186 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 0.4749942137386815 prompts/s 0.47516367776809365 prompts/s 1.00
{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 125.55997045968307 tokens/s 125.60476658121787 tokens/s 1.00
{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 100.1161137437477 tokens/s 99.04628475183323 tokens/s 0.99
{"name": "request_throughput", "description": "VLLM Engine throughput - 2:4 Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 5.797046926822314 prompts/s 5.87975536866905 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine throughput - 2:4 Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2710.4672611050414 tokens/s 2749.138420174901 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 4.274700445404838 prompts/s 4.318046045498215 prompts/s 1.01
{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1354.609824144339 tokens/s 1368.345611357929 tokens/s 1.01
{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 853.0791694870679 tokens/s 868.194973999962 tokens/s 1.02
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2.036066805954376 prompts/s 2.0376904303944783 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 4171.900885400516 tokens/s 4175.227691878286 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 6.800100878310634 prompts/s 6.9029220291995745 prompts/s 1.02
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 884.0131141803824 tokens/s 897.3798637959447 tokens/s 1.02
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1.9992823136293956 prompts/s 1.9983647261581747 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 4096.529460626632 tokens/s 4094.6493238981 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1.0196892597058707 prompts/s 1.0260038376978546 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 132.55960376176319 tokens/s 133.38049890072108 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 23.222352163607585 prompts/s 23.863569112048754 prompts/s 1.03
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2995.6834291053788 tokens/s 3078.400415454289 tokens/s 1.03
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 4.860799799667988 prompts/s 4.954289642958029 prompts/s 1.02
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 631.9039739568385 tokens/s 644.0576535845438 tokens/s 1.02
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1.9932820837494252 prompts/s 2.007537129014732 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 259.1266708874253 tokens/s 260.97982677191516 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 0.9682670119511764 prompts/s 0.9683190072609545 prompts/s 1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 288.71463006689527 tokens/s 288.73013385504726 tokens/s 1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 210.08166602634023 tokens/s 210.09294727538511 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 4.0353072272457124 prompts/s 4.0345483051593805 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 4136.189907926855 tokens/s 4135.412012788364 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 6.205889620860116 prompts/s 6.302927217364502 prompts/s 1.02
{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2901.625751129356 tokens/s 2946.9966497509463 tokens/s 1.02
{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 7.267537283282574 prompts/s 7.398279920780004 prompts/s 1.02
{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3398.0097321716003 tokens/s 3459.1397597598984 tokens/s 1.02
{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3.4179693546746175 prompts/s 3.4511180854075083 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 444.3360161077003 tokens/s 448.64535110297606 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 0.24138168362598794 prompts/s 0.24240453590500213 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 31.379618871378433 tokens/s 31.51258966765028 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3.7543979627762774 prompts/s 3.7562769453806886 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3848.2579118456847 tokens/s 3850.183869015206 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1.8081137137950178 prompts/s 1.822401442100299 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 235.0547827933523 tokens/s 236.91218747303887 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 6.796479413557794 prompts/s 6.912159148592688 prompts/s 1.02
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 883.5423237625132 tokens/s 898.5806893170494 tokens/s 1.02
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 14.182162014136532 prompts/s 14.196516357041775 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3644.8156376330885 tokens/s 3648.504703759736 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3.756234320721016 prompts/s 3.7549806062690445 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3850.1401787390414 tokens/s 3848.855121425771 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 7.239799578278059 prompts/s 7.276742675255298 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 941.1739451761476 tokens/s 945.9765477831888 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 0.786646061961246 prompts/s 0.7963773368324081 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 102.26398805496197 tokens/s 103.52905378821305 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 5.770573970274432 prompts/s 5.773227530175982 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2960.3044467507834 tokens/s 2961.6657229802786 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2.0371602765687693 prompts/s 2.0370743705794987 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 4174.1414066894085 tokens/s 4173.965385317393 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2.3027305602489907 prompts/s 2.308000394860425 prompts/s 1.00
{"name": "input_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 711.1446031531616 tokens/s 712.7720686100955 tokens/s 1.00
{"name": "output_throughput", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 463.02077982521234 tokens/s 463.28645792726303 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1.7261278855439541 prompts/s 1.737335920174266 prompts/s 1.01
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 562.9547455836104 tokens/s 566.610104109848 tokens/s 1.01
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 392.31434582642987 tokens/s 394.8871888640364 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3.2676304237452567 prompts/s 3.295618228083936 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1540.2498823191067 tokens/s 1553.4423816990125 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 12.007680633364146 prompts/s 12.309064576659251 prompts/s 1.03
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1560.998482337339 tokens/s 1600.1783949657026 tokens/s 1.03
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 13.899586744264298 prompts/s 13.821162696155543 prompts/s 0.99
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3572.1937932759242 tokens/s 3552.0388129119747 tokens/s 0.99
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 9.405814919310409 prompts/s 9.409021876126587 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2417.294434262775 tokens/s 2418.118622164533 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 8.04542699094911 prompts/s 8.079534158631766 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 4127.304046356894 tokens/s 4144.801023378096 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2.9573920709465593 prompts/s 3.0583045890959366 prompts/s 1.03
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 384.4609692230527 tokens/s 397.57959658247177 tokens/s 1.03
{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 6.811911799769203 prompts/s 6.922962631124627 prompts/s 1.02
{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 885.5485339699965 tokens/s 899.9851420462015 tokens/s 1.02
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 25.869353864604488 prompts/s 26.083618499604615 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1681.5080011992916 tokens/s 1695.4352024743 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2.2321775255171805 prompts/s 2.2350940323441253 prompts/s 1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 689.3559446137192 tokens/s 690.2566396953952 tokens/s 1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 512.418672757724 tokens/s 513.0703053126587 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 0.9797160743877232 prompts/s 0.9804741017480871 prompts/s 1.00
{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 288.87255025346815 tokens/s 289.0960571474293 tokens/s 1.00
{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 211.44232317435845 tokens/s 211.62553012130712 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2.080173908874548 prompts/s 2.0818269797194646 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2048,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 4262.276339283949 tokens/s 4265.663481445183 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3.7515278155628633 prompts/s 3.7886955531419426 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 487.69861602317224 tokens/s 492.5304219084525 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 0.46642354991892127 prompts/s 0.4667121955992088 prompts/s 1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 126.1271468787419 tokens/s 126.2052005193007 tokens/s 1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 118.19172754945465 tokens/s 118.25553612092752 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 6.507829630940218 prompts/s 6.6028805678878815 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 846.0178520222283 tokens/s 858.3744738254245 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 4.508207550328998 prompts/s 4.5663181662764245 prompts/s 1.01
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1428.6058906237563 tokens/s 1447.0205637113363 tokens/s 1.01
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1030.5161365712045 tokens/s 1043.7690064474652 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 10.62911505192549 prompts/s 10.962920090017784 prompts/s 1.03
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1381.7849567503138 tokens/s 1425.1796117023118 tokens/s 1.03
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 0.4918963409346558 prompts/s 0.4918959085118237 prompts/s 1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 130.02787876266692 tokens/s 130.02776445601546 tokens/s 1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 121.9837339339155 tokens/s 121.99018531093228 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 0.46422524116609 prompts/s 0.46460681411479277 prompts/s 1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 122.71330024984422 tokens/s 122.81416524310433 tokens/s 1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 115.13714431401364 tokens/s 115.23487941551167 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 0.9839163607924347 prompts/s 0.9839384614861079 prompts/s 1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 290.111018700852 tokens/s 290.11753516405054 tokens/s 1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 212.40130452546552 tokens/s 212.39623609126286 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 25.848835029286406 prompts/s 25.412318682202837 prompts/s 0.98
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1680.1742769036164 tokens/s 1651.8007143431844 tokens/s 0.98
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1.8074398854092264 prompts/s 1.8218718456994156 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 234.96718510319943 tokens/s 236.84333994092404 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 12.30110028060587 prompts/s 12.324311091322674 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1586.8419361981573 tokens/s 1589.836130780625 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3.442067945588395 prompts/s 3.505890035852268 prompts/s 1.02
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1090.7569112775063 tokens/s 1110.9814934612252 tokens/s 1.02
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 786.781006866704 tokens/s 801.362320834968 tokens/s 1.02
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2.237698791428178 prompts/s 2.24610852028914 prompts/s 1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 691.0610587607928 tokens/s 693.6582072924942 tokens/s 1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 513.6324297892583 tokens/s 515.2483101202477 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1.864500911228607 prompts/s 1.884797463343204 prompts/s 1.01
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 592.4091175252728 tokens/s 598.8579545596784 tokens/s 1.01
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 417.49531504048724 tokens/s 422.0237434855346 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 0.9839288709868197 prompts/s 0.9839009120096995 prompts/s 1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 290.1147073733671 tokens/s 290.1064635757666 tokens/s 1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 212.49255874121687 tokens/s 212.39468987553386 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3.4164448228296043 prompts/s 3.450353520698302 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 16,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 444.1378269678485 tokens/s 448.54595769077923 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 0.916767730797845 prompts/s 0.9225130310962376 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 119.17980500371985 tokens/s 119.92669404251089 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 0.4918854123470756 prompts/s 0.49190098937367166 prompts/s 1.00
{"name": "input_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 130.02498989982595 tokens/s 130.02910753103637 tokens/s 1.00
{"name": "output_throughput", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 121.99741997032169 tokens/s 121.99800404452888 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 17.93689230222319 prompts/s 18.701195224198248 prompts/s 1.04
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 32\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2331.7959992890146 tokens/s 2431.1553791457723 tokens/s 1.04
{"name": "request_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1.8098245039587273 prompts/s 1.8229829345877713 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 8,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 235.27718551463457 tokens/s 236.98778149641026 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3.4503719750764987 prompts/s 3.4792191849472083 prompts/s 1.01
{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1093.3883751819917 tokens/s 1102.5297675179208 tokens/s 1.01
{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 788.6538225912187 tokens/s 794.7487604996643 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 7.447690531210302 prompts/s 7.46030745286681 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3820.665242510885 tokens/s 3827.1377233206736 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 0.7733226764923229 prompts/s 0.789525601336801 prompts/s 1.02
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 100.53194794400198 tokens/s 102.63832817378415 tokens/s 1.02
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 6.413429458334058 prompts/s 6.502117596298573 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 64\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 833.7458295834275 tokens/s 845.2752875188145 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 25.85235472175337 prompts/s 26.032054467352 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1680.4030569139688 tokens/s 1692.0835403778801 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 23.94245344919835 prompts/s 24.15214323728174 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3088.576494946587 tokens/s 3115.6264776093444 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2.3411473200327686 prompts/s 2.346723103195728 prompts/s 1.00
{"name": "input_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 723.0087230213198 tokens/s 724.7306735495926 tokens/s 1.00
{"name": "output_throughput", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 536.8937541382348 tokens/s 538.0316413876157 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 0.9129919460580217 prompts/s 0.9194572654591827 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 118.68895298754282 tokens/s 119.52944450969376 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2.782061675562657 prompts/s 2.828263606943077 prompts/s 1.02
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 4\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 361.6680178231454 tokens/s 367.6742689026 tokens/s 1.02
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 24.204976065063367 prompts/s 24.321787560348138 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3122.4419123931743 tokens/s 3137.51059528491 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3.753542771995297 prompts/s 3.7583169769175058 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3847.3813412951795 tokens/s 3852.274901340443 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 7.448194548287643 prompts/s 7.453090157218175 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - 2:4 Sparse (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 512,\n \"output-len\": 1,\n \"num-prompts\": 1,\n \"sparsity\": \"semi_structured_sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3820.923803271561 tokens/s 3823.4352506529235 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 48.82346198140881 prompts/s 49.86026659890417 prompts/s 1.02
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 64,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3173.525028791573 tokens/s 3240.917328928771 tokens/s 1.02
{"name": "request_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 0.2574018688618385 prompts/s 0.25877241153393077 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine decode throughput - Dense (synthetic)\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 2,\n \"output-len\": 128,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 33.462242952039006 tokens/s 33.640413499411 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 5.785458152194025 prompts/s 5.865371483465854 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine throughput - Dense (with dataset)\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2705.048813639838 tokens/s 2742.4130908092943 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3.4868586260866627 prompts/s 3.4900366416337203 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 1024,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3574.0300917388295 tokens/s 3577.2875576745632 tokens/s 1.00
{"name": "request_throughput", "description": "VLLM Engine throughput - Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 5.797948384736882 prompts/s 5.875082478847883 prompts/s 1.01
{"name": "token_throughput", "description": "VLLM Engine throughput - Sparse (with dataset)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"dataset\": \"sharegpt\",\n \"output-len\": 128,\n \"num-prompts\": 1000,\n \"sparsity\": \"sparse_w16a16\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2710.888746767577 tokens/s 2746.953563810116 tokens/s 1.01
{"name": "request_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 28.852220857086547 prompts/s 28.97109811558995 prompts/s 1.00
{"name": "token_throughput", "description": "VLLM Engine prefill throughput - Dense (synthetic)\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 128,\n \"output-len\": 1,\n \"num-prompts\": 1\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3721.9364905641646 tokens/s 3737.2716569111035 tokens/s 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

smaller_is_better

Benchmark suite Current: 3d151aa Previous: 5d256f5 Ratio
{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2351.4776980000534 ms 2298.491450000256 ms 1.02
{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 115.07024195999597 ms 114.78044986799553 ms 1.00
{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 80.43757549967268 ms 80.56627499991009 ms 1.00
{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 17.268589149404406 ms 16.997969205969508 ms 1.02
{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 15.421280329068123 ms 15.094163741772734 ms 1.02
{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 5999.9092084999575 ms 5931.841860499844 ms 1.01
{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 126.8381950299772 ms 127.6032645899977 ms 0.99
{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 83.53869450002094 ms 89.6591530004116 ms 0.93
{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 40.28193766299183 ms 39.93344069689709 ms 1.01
{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 40.12230005124164 ms 39.744499879037235 ms 1.01
{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 53790.03820749995 ms 50117.24847950018 ms 1.07
{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 32432.77763132 ms 29603.981275650014 ms 1.10
{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 32120.005966499775 ms 29778.72212850025 ms 1.08
{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 96.73603420276847 ms 94.97658300936139 ms 1.02
{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 99.51846112902473 ms 97.70854773940782 ms 1.02
{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 5053.72388250089 ms 5012.179495000055 ms 1.01
{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 87.8623702732754 ms 87.426237500149 ms 1.00
{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 59.482906999619445 ms 57.789587999650394 ms 1.03
{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 36.13592613332169 ms 36.19402189485589 ms 1.00
{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 30.773394148962605 ms 30.760241742148754 ms 1.00
{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 14154.261130999657 ms 12107.788427500054 ms 1.17
{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 168.9256128733226 ms 162.72313893135774 ms 1.04
{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 138.95462949767534 ms 128.43876700026158 ms 1.08
{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 124.25997606894775 ms 107.88996677468617 ms 1.15
{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 112.44488433213928 ms 96.48709741669009 ms 1.17
{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 6006.870697500744 ms 5974.363770000309 ms 1.01
{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 121.36992918332301 ms 121.22592117333625 ms 1.00
{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 83.03905350021523 ms 80.47201550016325 ms 1.03
{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 39.3729859785975 ms 39.06469746379868 ms 1.01
{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 39.04064974857238 ms 38.7208552209086 ms 1.01
{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 5765.572242000417 ms 5675.264227499611 ms 1.02
{"name": "mean_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 116.4544837292924 ms 116.61219194392955 ms 1.00
{"name": "median_ttft_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 86.1844965002092 ms 88.89358400028868 ms 0.97
{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 52.46626782183014 ms 51.640160856100806 ms 1.02
{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 45.50704121206209 ms 45.15616100629305 ms 1.01
{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 71896.19401000073 ms 70111.49535349978 ms 1.03
{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 52730.1389921173 ms 51347.631983144005 ms 1.03
{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 64738.412784499815 ms 62929.77782849994 ms 1.03
{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 66.48159001757418 ms 66.66126375049774 ms 1.00
{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 64.78268955756674 ms 64.52694111087425 ms 1.00
{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 7571.667430999696 ms 7362.834812499386 ms 1.03
{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 156.26068746138358 ms 154.61800476534094 ms 1.01
{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 131.80199599992193 ms 129.36881800033007 ms 1.02
{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 56.2828019612381 ms 55.27959334296553 ms 1.02
{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 55.715598581575556 ms 54.54445435162978 ms 1.02
{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 3867.8116559995033 ms 3832.0554109986915 ms 1.01
{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 144.32839204327442 ms 145.28621267667404 ms 0.99
{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 96.43640349986526 ms 98.93651149923244 ms 0.97
{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 25.83208101018101 ms 25.443178098120548 ms 1.02
{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 25.13378567925105 ms 24.931781164987555 ms 1.01
{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 6070.155346999854 ms 6056.275111499872 ms 1.00
{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 105.49893929326572 ms 101.05402738006887 ms 1.04
{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 66.88622100045905 ms 67.27566650033623 ms 0.99
{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 34.40702931351279 ms 34.30688045566592 ms 1.00
{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 34.929082047968755 ms 34.78871438648948 ms 1.00
{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 12421.510741999555 ms 10733.578026000032 ms 1.16
{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 196.09448883065488 ms 194.6180394186813 ms 1.01
{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 154.1681745002279 ms 148.89166299963108 ms 1.04
{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 95.61047598259354 ms 81.96528384593707 ms 1.17
{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 98.40706089880393 ms 83.47636292423746 ms 1.18
{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2017.667531999905 ms 1995.3663525002412 ms 1.01
{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 116.89579616000022 ms 116.09958058663324 ms 1.01
{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 86.73421899948153 ms 86.66624400029832 ms 1.00
{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 11.451387488055541 ms 11.322725361297303 ms 1.01
{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 11.429339942495401 ms 11.305585780190128 ms 1.01
{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 6341.659947500375 ms 6299.981730000354 ms 1.01
{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 110.80191320000571 ms 110.18293989333566 ms 1.01
{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 69.47407150028084 ms 69.29191799986256 ms 1.00
{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 36.06700485024417 ms 35.82895880896143 ms 1.01
{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - teknium/OpenHermes-2.5-Mistral-7B\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 36.79354970044416 ms 36.506332861635215 ms 1.01
{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 2077.9450419995555 ms 2036.3052165002955 ms 1.02
{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 136.28522477332808 ms 136.03586736004218 ms 1.00
{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 104.86377300003369 ms 102.57559550063888 ms 1.02
{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 14.820671679247242 ms 14.56990023969321 ms 1.02
{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 13.437915755196407 ms 13.239858912022912 ms 1.01
{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 60706.566852499236 ms 56577.966683999875 ms 1.07
{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 19808.064829272684 ms 17532.284571051987 ms 1.13
{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 14854.807530500693 ms 12446.751371499886 ms 1.19
{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 255.75387223560665 ms 250.3874454559828 ms 1.02
{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 272.34452073326787 ms 265.98392577690976 ms 1.02
{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 11470.182037000086 ms 11176.347484000871 ms 1.03
{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 194.57400956401156 ms 195.4260834372908 ms 1.00
{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 167.22216949983704 ms 171.80511850074254 ms 0.97
{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 88.3449534679851 ms 85.44794902806204 ms 1.03
{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 90.8285186219457 ms 86.38680994022593 ms 1.05
{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 248373.33666350058 ms 243817.96864300032 ms 1.02
{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 232949.291254374 ms 228636.75700275466 ms 1.02
{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 235470.11378599927 ms 231814.10013550022 ms 1.02
{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 68.84879945218638 ms 67.6634427048007 ms 1.02
{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 66.46263290652307 ms 65.7771695800051 ms 1.01
{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1786.582091001037 ms 1743.1110670004273 ms 1.02
{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 92.88359695337324 ms 91.87637143670752 ms 1.01
{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 57.75778449969948 ms 57.173636500010616 ms 1.01
{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 12.322249643731835 ms 12.071613849582373 ms 1.02
{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 11.547213422015203 ms 11.251413051084208 ms 1.03
{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 1903.2035004993304 ms 1869.9571789998117 ms 1.02
{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 78.7932918666047 ms 78.12671420667661 ms 1.01
{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 37.84798450033122 ms 39.117287500175735 ms 0.97
{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 10.836956241126932 ms 10.639487958875774 ms 1.02
{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"150,0.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 10.931278148527122 ms 10.700277322754006 ms 1.02
{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 68495.39479750002 ms 65910.23518449947 ms 1.04
{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 30068.90573979466 ms 28790.14074611335 ms 1.04
{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 28277.71570850018 ms 27066.89145799919 ms 1.04
{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 193.86638649773676 ms 191.71991239738932 ms 1.01
{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 204.25048554519898 ms 202.4104537097768 ms 1.01
{"name": "median_request_latency", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 5598.584875999222 ms 5456.0780435003835 ms 1.03
{"name": "mean_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 186.2009436306471 ms 184.2748047346516 ms 1.01
{"name": "median_ttft_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 175.9402850002516 ms 175.41268799959653 ms 1.00
{"name": "mean_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 41.987163951343035 ms 41.265894817093105 ms 1.02
{"name": "median_tpot_ms", "description": "VLLM Serving - Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50\nmax-model-len - 4096\nsparsity - sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"750,2.5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 39.02829481995171 ms 38.468251390749664 ms 1.01

This comment was automatically generated by workflow using github-action-benchmark.

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Performance Alert ⚠️

Possible performance regression was detected for benchmark 'smaller_is_better'.
Benchmark result of this commit is worse than the previous benchmark result exceeding threshold 1.10.

Benchmark suite Current: 3d151aa Previous: 5d256f5 Ratio
{"name": "median_request_latency", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 14154.261130999657 ms 12107.788427500054 ms 1.17
{"name": "mean_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 124.25997606894775 ms 107.88996677468617 ms 1.15
{"name": "median_tpot_ms", "description": "VLLM Serving - 2:4 Sparse\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4\nmax-model-len - 4096\nsparsity - semi_structured_sparse_w16a16\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 112.44488433213928 ms 96.48709741669009 ms 1.17
{"name": "median_request_latency", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 12421.510741999555 ms 10733.578026000032 ms 1.16
{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 95.61047598259354 ms 81.96528384593707 ms 1.17
{"name": "median_tpot_ms", "description": "VLLM Serving - Dense\nmodel - neuralmagic/OpenHermes-2.5-Mistral-7B-marlin\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 98.40706089880393 ms 83.47636292423746 ms 1.18
{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 19808.064829272684 ms 17532.284571051987 ms 1.13
{"name": "median_ttft_ms", "description": "VLLM Serving - Dense\nmodel - TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"1500,5\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA A10G x 1", "vllm_version": "0.1.0", "python_version": "3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]", "torch_version": "2.1.2+cu121"} 14854.807530500693 ms 12446.751371499886 ms 1.19

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.