forked from vllm-project/vllm
Commit
Signed-off-by: Chendi Xue <[email protected]>
Showing 2 changed files with 121 additions and 0 deletions.
@@ -0,0 +1,57 @@
from vllm import LLM, SamplingParams

import argparse
import os

# Parse the command-line arguments.
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default="/software/data/DeepSeek-R1/", help="The model path.")
parser.add_argument("--tokenizer", type=str, default="deepseek-ai/DeepSeek-R1", help="The tokenizer path or name.")
#parser.add_argument("--model", type=str, default="/data/models/DeepSeek-R1-bf16-small/", help="The model path.")
#parser.add_argument("--tokenizer", type=str, default="opensourcerelease/DeepSeek-R1-bf16", help="The tokenizer path or name.")
parser.add_argument("--tp_size", type=int, default=8, help="The tensor parallel size.")
args = parser.parse_args()

# Configure the HPU/Ray environment before the engine is created.
os.environ["VLLM_SKIP_WARMUP"] = "true"
os.environ["HABANA_VISIBLE_DEVICES"] = "ALL"
os.environ["PT_HPU_ENABLE_LAZY_COLLECTIVES"] = "true"
os.environ["VLLM_RAY_DISABLE_LOG_TO_DRIVER"] = "1"
os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1"

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0, max_tokens=50)
model = args.model
if args.tp_size == 1:
    llm = LLM(
        model=model,
        tokenizer=args.tokenizer,
        trust_remote_code=True,
        dtype="bfloat16",
    )
else:
    llm = LLM(
        model=model,
        tokenizer=args.tokenizer,
        tensor_parallel_size=args.tp_size,
        distributed_executor_backend='ray',
        trust_remote_code=True,
        max_model_len=1024,
        dtype="bfloat16",
    )

# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
@@ -0,0 +1,64 @@
#!/bin/bash
tp_parallel=8
bs=32
in_len=1024
out_len=1024
multi_step=1
total_len=$((in_len + out_len))
VLLM_DECODE_BLOCK_BUCKET_MIN=$((in_len * bs / 128))
VLLM_DECODE_BLOCK_BUCKET_MAX=$((total_len * bs / 128))
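# With the values above and an assumed KV-cache block size of 128 tokens,
# the decode block buckets work out to:
#   VLLM_DECODE_BLOCK_BUCKET_MIN = 1024 * 32 / 128 = 256 blocks
#   VLLM_DECODE_BLOCK_BUCKET_MAX = (1024 + 1024) * 32 / 128 = 512 blocks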

model="/software/data/DeepSeek-R1/"
tokenizer="/software/data/DeepSeek-R1/"
model_name="DeepSeek-R1"

# Make sure the log directory exists before the server log is written into it.
mkdir -p benchmark_logs

HABANA_VISIBLE_DEVICES="ALL" \ | ||
PT_HPU_ENABLE_LAZY_COLLECTIVES="true" \ | ||
VLLM_RAY_DISABLE_LOG_TO_DRIVER="1" \ | ||
RAY_IGNORE_UNHANDLED_ERRORS="1" \ | ||
VLLM_PROMPT_BS_BUCKET_MIN=1 \ | ||
VLLM_PROMPT_BS_BUCKET_MAX=${bs} \ | ||
VLLM_PROMPT_SEQ_BUCKET_MIN=${in_len} \ | ||
VLLM_PROMPT_SEQ_BUCKET_MAX=${in_len} \ | ||
VLLM_DECODE_BS_BUCKET_MIN=${bs} \ | ||
VLLM_DECODE_BS_BUCKET_MAX=${bs} \ | ||
VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN} \ | ||
VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX} \ | ||
python -m vllm.entrypoints.openai.api_server \ | ||
--port 8080 \ | ||
--model ${model} \ | ||
--tensor-parallel-size ${tp_parrallel} \ | ||
--max-num-seqs ${bs} \ | ||
--disable-log-requests \ | ||
--dtype bfloat16 \ | ||
--use-v2-block-manager \ | ||
--num_scheduler_steps ${multi_step}\ | ||
--max-model-len 2048 \ | ||
--max-num-batched-tokens 2048 \ | ||
--distributed_executor_backend ray \ | ||
--trust_remote_code 2>&1 | tee benchmark_logs/serving.log & | ||
pid=$(($!-1)) | ||

# Wait (up to 100 * 5s) for the server to come up before starting the benchmark.
n=0
until [[ "$n" -ge 100 ]]; do
    n=$((n+1))
    if grep -q "Uvicorn running on" benchmark_logs/serving.log; then
        break
    fi
    sleep 5s
done
sleep 5s
echo ${pid}

num_prompts=32
request_rate=1
start_time=$(date +%s)
echo "Start to benchmark"
python benchmarks/benchmark_serving.py \
    --backend vllm \
    --model ${model} \
    --tokenizer ${tokenizer} \
    --dataset-name sonnet \
    --dataset-path benchmarks/sonnet.txt \
    --request-rate ${request_rate} \
    --num-prompts ${num_prompts} \
    --port 8080 \
    --sonnet-input-len ${in_len} \
    --sonnet-output-len ${out_len} \
    --sonnet-prefix-len 100 \
    --save-result | tee benchmark_logs/static-online-gaudi3-TPparallel${tp_parallel}-multistep${multi_step}_nprompt${num_prompts}_rrate${request_rate}_bs${bs}_i${in_len}_o${out_len}.log
end_time=$(date +%s)
echo "Time elapsed: $((end_time - start_time))s"

sleep 10

kill ${pid}
# Alternative benchmark_serving.py options for the chat endpoint:
#   --backend openai-chat --endpoint "v1/chat/completions"
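
A minimal sketch of how the benchmark script above might be launched, assuming it is saved as run_serving_benchmark.sh (the diff view does not show the file name) and is run from the vLLM repository root so that benchmarks/benchmark_serving.py and benchmarks/sonnet.txt resolve:

bash run_serving_benchmark.sh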