Benchmarking : Misc updates (#95)
SUMMARY:
Fixes and quality-of-life changes:
- Fix the vllm engine `temperature` to 0.0 so that text generation is deterministic
- Fix the time-per-output-token (TPOT) metric computation (a sketch of the corrected formula follows this list)
- Add `num_warmup_prompts` and `log_model_io` options to the throughput benchmark
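
For reference, a minimal sketch of the corrected time-per-output-token (TPOT)
computation follows. The names below are illustrative only (a stand-in for the
benchmark's RequestFuncOutput records): TTFT already covers the first generated
token, so the remaining latency is averaged over the other output_len - 1 tokens,
and single-token outputs are skipped because they have no inter-token interval.

    from dataclasses import dataclass
    from typing import Optional


    @dataclass
    class CompletedRequest:
        """Illustrative stand-in for a completed benchmark request."""
        latency: float  # end-to-end request latency, in seconds
        ttft: float     # time to first token, in seconds


    def time_per_output_token(req: CompletedRequest,
                              output_len: int) -> Optional[float]:
        # The first token is accounted for by TTFT, so only the remaining
        # (output_len - 1) tokens share the (latency - ttft) interval.
        if output_len <= 1:
            return None  # a single-token output has no per-token interval
        return (req.latency - req.ttft) / (output_len - 1)


    # Example: a 10-token completion taking 2.0 s end-to-end with a 0.5 s TTFT
    # gives (2.0 - 0.5) / 9 ≈ 0.167 s per output token.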

TEST PLAN:
Manual testing

---------

Co-authored-by: Varun Sundar Rabindranath <[email protected]>
Authored by varun-sundar-rabindranath and Varun Sundar Rabindranath on Mar 11, 2024
Commit aebf20b (parent 3ae527f)
Showing 4 changed files with 64 additions and 31 deletions.
neuralmagic/benchmarks/scripts/backend_request_func.py (3 additions, 1 deletion)

@@ -111,7 +111,9 @@ async def async_request_vllm(
             "n": 1,
             "best_of": request_func_input.best_of,
             "use_beam_search": request_func_input.use_beam_search,
-            "temperature": 0.0 if request_func_input.use_beam_search else 1.0,
+            # TODO (varun) : Make temperature configurable
+            #"temperature": 0.0 if request_func_input.use_beam_search else 1.0,
+            "temperature": 0.0,
             "top_p": 1.0,
             "max_tokens": request_func_input.output_len,
             "ignore_eos": True,
neuralmagic/benchmarks/scripts/benchmark_serving.py (5 additions, 4 deletions)

@@ -33,8 +33,7 @@
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
 from vllm.transformers_utils.tokenizer import get_tokenizer
-from .common import instantiate_benchmark_results_dict, generate_synthetic_requests, print_benchmark_io
-# TODO (move this to scripts)
+from .common import instantiate_benchmark_results_dict, generate_synthetic_requests, print_serving_request_io
 from .datasets_registry import get_dataset, DatasetArgs
 
 from neuralmagic.benchmarks.scripts.backend_request_func import (
@@ -100,7 +99,9 @@ def calculate_metrics(
             total_output += output_len
             total_input += input_requests[i][1]
             latencies.append(outputs[i].latency)
-            tpots.append((outputs[i].latency - outputs[i].ttft) / output_len)
+            if output_len > 1:
+                tpots.append(
+                    (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
             ttfts.append(outputs[i].ttft)
             completed += 1
 
@@ -167,7 +168,7 @@ async def benchmark(backend: str, api_url: str, model_id: str,
 
     # Dump model i/o
     if log_model_io:
-        print_benchmark_io(outputs)
+        print_serving_request_io(input_requests, outputs)
 
     metrics = calculate_metrics(
         input_requests=input_requests,
neuralmagic/benchmarks/scripts/benchmark_throughput.py (33 additions, 21 deletions)

@@ -13,7 +13,7 @@
 from pathlib import Path
 from typing import List, Optional, Tuple
 from transformers import AutoTokenizer
-from .common import instantiate_benchmark_results_dict, generate_synthetic_requests, warmup_vllm_engine, num_available_gpus
+from .common import instantiate_benchmark_results_dict, generate_synthetic_requests, warmup_vllm_engine, num_available_gpus, print_request_outputs
 from .datasets_registry import get_dataset, DatasetArgs
 
 
@@ -25,21 +25,21 @@ def get_tensor_parallel_size(args: argparse.Namespace) -> int:
     return tensor_parallel_size
 
 
-def run_vllm(
-    requests: List[Tuple[str, int, int]],
-    model: str,
-    tokenizer: str,
-    quantization: Optional[str],
-    tensor_parallel_size: int,
-    seed: int,
-    n: int,
-    use_beam_search: bool,
-    trust_remote_code: bool,
-    dtype: str,
-    max_model_len: Optional[int],
-    enforce_eager: bool,
-    sparsity: Optional[str],
-) -> float:
+def run_vllm(requests: List[Tuple[str, int, int]],
+             model: str,
+             tokenizer: str,
+             quantization: Optional[str],
+             tensor_parallel_size: int,
+             seed: int,
+             n: int,
+             use_beam_search: bool,
+             trust_remote_code: bool,
+             dtype: str,
+             max_model_len: Optional[int],
+             enforce_eager: bool,
+             sparsity: Optional[str],
+             num_warmup_prompts: int,
+             log_model_io: bool = False) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(
         model=model,
@@ -53,13 +53,15 @@ def run_vllm(
         enforce_eager=enforce_eager,
     )
 
-    warmup_vllm_engine(engine=llm, model=model, num_prompts=1000)
+    warmup_vllm_engine(engine=llm, model=model, num_prompts=num_warmup_prompts)
 
     # Add the requests to the engine.
     for prompt, _, output_len in requests:
         sampling_params = SamplingParams(
             n=n,
-            temperature=0.0 if use_beam_search else 1.0,
+            # TODO (varun) Make temperature configurable
+            #temperature=0.0 if use_beam_search else 1.0,
+            temperature=0.0,
             top_p=1.0,
             use_beam_search=use_beam_search,
             ignore_eos=True,
@@ -74,9 +76,12 @@
 
     start = time.perf_counter()
     # FIXME(woosuk): Do not use internal method.
-    llm._run_engine(use_tqdm=True)
+    outputs = llm._run_engine(use_tqdm=True)
     end = time.perf_counter()
 
+    if log_model_io:
+        print_request_outputs(outputs)
+
     return end - start
 
 
@@ -96,7 +101,7 @@ def main(args: argparse.Namespace):
             num_samples=args.num_prompts,
             max_len=2048,
             seed=42,
-        ))
+            fixed_output_len=args.output_len))
     else:
         # Make a synthetic dataset.
         requests = generate_synthetic_requests(args.input_len, args.output_len,
@@ -114,7 +119,9 @@
                             args.dtype,
                             args.max_model_len,
                             args.enforce_eager,
-                            sparsity=args.sparsity)
+                            sparsity=args.sparsity,
+                            num_warmup_prompts=args.num_warmup_prompts,
+                            log_model_io=args.log_model_io)
 
     total_prompt_tokens = sum(prompt_len for _, prompt_len, _ in requests)
     total_output_tokens = sum(output_len for _, _, output_len in requests)
@@ -189,10 +196,15 @@ def main(args: argparse.Namespace):
                         type=int,
                         default=1000,
                         help="Number of prompts to process.")
+    parser.add_argument("--num-warmup-prompts",
+                        type=int,
+                        default=1000,
+                        help="Number of prompts to do warmups with.")
     parser.add_argument("--seed", type=int, default=0)
     parser.add_argument('--trust-remote-code',
                         action='store_true',
                         help='trust remote code from huggingface')
+    parser.add_argument("--log-model-io", action="store_true")
     parser.add_argument(
         '--max-model-len',
         type=int,
neuralmagic/benchmarks/scripts/common.py (23 additions, 5 deletions)

@@ -12,7 +12,7 @@
 from vllm.outputs import RequestOutput
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from .datasets_registry import SHAREGPT_PATH, SHAREGPT_DOWNLOAD_STR
-from .backend_request_func import RequestFuncInput, async_request_vllm
+from .backend_request_func import RequestFuncInput, RequestFuncOutput, async_request_vllm
 from ...tools.call_cmd import call_cmd
 
 
@@ -204,9 +204,27 @@ def instantiate_benchmark_results_dict(benchmarking_script_name: str,
     return result_dict
 
 
-def print_benchmark_io(results: List[RequestOutput]) -> None:
+def format_io_log(prompt: str, output_text: str, n_prompt_tokens: int,
+                  n_output_tokens: int) -> str:
+    return f"\n=== Prompt ({n_prompt_tokens}) ==\n{prompt}\n==== output({n_output_tokens}) ==\n{output_text}\n"
+
+
+def print_request_outputs(results: List[RequestOutput]) -> None:
     for result in results:
         output = result.outputs[0]
-        print(
-            f"\n\n inputs({len(result.prompt_token_ids)}): {result.prompt}\n output({len(output.token_ids)}): {output.text}"
-        )
+        io_log = format_io_log(result.prompt, output.text,
+                               len(result.prompt_token_ids),
+                               len(output.token_ids))
+        print(f"\n{io_log}")
+
+
+def print_serving_request_io(inputs: List[Tuple[str, int, int]],
+                             outputs: List[RequestFuncOutput]) -> None:
+    """
+    inputs: list of tuples where the tuple is [prompt, prompt_length, output_length],
+    outputs: list of RequestFuncOutput that is the output from the serving case (benchmark_serving.py)
+    Format and print the inputs and outputs.
+    """
+    for i, o in zip(inputs, outputs):
+        io_log = format_io_log(i[0], o.generated_text, i[1], i[2])
+        print(f"\n{io_log}")