
Commit

update run script
Signed-off-by: Chendi Xue <[email protected]>
xuechendi committed Jan 30, 2025
1 parent 6326852 commit ea681df
Showing 2 changed files with 121 additions and 0 deletions.
57 changes: 57 additions & 0 deletions scripts/run_example_tp.py
@@ -0,0 +1,57 @@
import argparse
import os

from vllm import LLM, SamplingParams

# Parse the command-line arguments.
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default="/software/data/DeepSeek-R1/", help="The model path.")
parser.add_argument("--tokenizer", type=str, default="deepseek-ai/DeepSeek-R1", help="The model path.")
#parser.add_argument("--model", type=str, default="/data/models/DeepSeek-R1-bf16-small/", help="The model path.")
#parser.add_argument("--tokenizer", type=str, default="opensourcerelease/DeepSeek-R1-bf16", help="The model path.")
parser.add_argument("--tp_size", type=int, default=8, help="The number of threads.")
args = parser.parse_args()

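# Gaudi/HPU and Ray runtime settings: skip graph warmup, expose all HPU
# devices, enable lazy collectives (needed for tensor parallelism in lazy
# mode), and silence Ray driver log forwarding and unhandled-error reports.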
os.environ["VLLM_SKIP_WARMUP"] = "true"
os.environ["HABANA_VISIBLE_DEVICES"] = "ALL"
os.environ["PT_HPU_ENABLE_LAZY_COLLECTIVES"] = "true"
os.environ["VLLM_RAY_DISABLE_LOG_TO_DRIVER"] = "1"
os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1"


# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0, max_tokens=50)
model = args.model
if args.tp_size == 1:
    llm = LLM(
        model=model,
        tokenizer=args.tokenizer,
        trust_remote_code=True,
        dtype="bfloat16",
    )
else:
    llm = LLM(
        model=model,
        tokenizer=args.tokenizer,
        tensor_parallel_size=args.tp_size,
        distributed_executor_backend='ray',
        trust_remote_code=True,
        max_model_len=1024,
        dtype="bfloat16",
    )

# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
64 changes: 64 additions & 0 deletions scripts/run_static-online.sh
@@ -0,0 +1,64 @@
#!/bin/bash
tp_parallel=8
bs=32
in_len=1024
out_len=1024
multi_step=1
total_len=$((in_len + out_len))
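# Decode block-bucket bounds, in KV-cache blocks (assumes the 128-token block
# size used on Gaudi; adjust the divisor if the block size differs).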
VLLM_DECODE_BLOCK_BUCKET_MIN=$((in_len * bs / 128))
VLLM_DECODE_BLOCK_BUCKET_MAX=$((total_len * bs / 128))

model="/software/data/DeepSeek-R1/"
tokenizer="/software/data/DeepSeek-R1/"
model_name="DeepSeek-R1"

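# Create the log directory that tee writes to below.
mkdir -p benchmark_logs

# The VLLM_*_BUCKET_* variables below pin HPU warmup buckets to this run's
# batch size and sequence lengths (per vLLM's Gaudi bucketing mechanism).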
HABANA_VISIBLE_DEVICES="ALL" \
PT_HPU_ENABLE_LAZY_COLLECTIVES="true" \
VLLM_RAY_DISABLE_LOG_TO_DRIVER="1" \
RAY_IGNORE_UNHANDLED_ERRORS="1" \
VLLM_PROMPT_BS_BUCKET_MIN=1 \
VLLM_PROMPT_BS_BUCKET_MAX=${bs} \
VLLM_PROMPT_SEQ_BUCKET_MIN=${in_len} \
VLLM_PROMPT_SEQ_BUCKET_MAX=${in_len} \
VLLM_DECODE_BS_BUCKET_MIN=${bs} \
VLLM_DECODE_BS_BUCKET_MAX=${bs} \
VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN} \
VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX} \
python -m vllm.entrypoints.openai.api_server \
    --port 8080 \
    --model ${model} \
    --tensor-parallel-size ${tp_parallel} \
    --max-num-seqs ${bs} \
    --disable-log-requests \
    --dtype bfloat16 \
    --use-v2-block-manager \
    --num_scheduler_steps ${multi_step} \
    --max-model-len 2048 \
    --max-num-batched-tokens 2048 \
    --distributed_executor_backend ray \
    --trust_remote_code 2>&1 | tee benchmark_logs/serving.log &
# $! is the PID of the backgrounded tee; the vLLM server is assumed to hold
# the immediately preceding PID in the pipeline.
pid=$(($! - 1))

# Wait (up to 100 x 5s) for the server to report readiness in the log.
n=0
until [[ "$n" -ge 100 ]]; do
    n=$((n + 1))
    if grep -q "Uvicorn running on" benchmark_logs/serving.log; then
        break
    fi
    sleep 5s
done
sleep 5s
echo ${pid}

num_prompts=32
request_rate=1
start_time=$(date +%s)
echo "Start to benchmark"
python benchmarks/benchmark_serving.py --backend vllm --model ${model} --tokenizer ${tokenizer} --dataset-name sonnet --dataset-path benchmarks/sonnet.txt --request-rate ${request_rate} --num-prompts ${num_prompts} --port 8080 --sonnet-input-len ${in_len} --sonnet-output-len ${out_len} --sonnet-prefix-len 100 \
    --save-result | tee benchmark_logs/static-online-gaudi3-TPparallel${tp_parallel}-multistep${multi_step}_nprompt${num_prompts}_rrate${request_rate}_bs${bs}_i${in_len}_o${out_len}.log
end_time=$(date +%s)
echo "Time elapsed: $((end_time - start_time))s"

sleep 10

kill ${pid}
#--backend openai-chat --endpoint "v1/chat/completions"
