[WIP] Prototyping re-arch #9166

Closed · wants to merge 31 commits
benchmarks/benchmark_throughput.py (4 changes: 2 additions & 2 deletions)
@@ -12,8 +12,8 @@
                           PreTrainedTokenizerBase)
 
 from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
-from vllm.entrypoints.openai.api_server import (
-    build_async_engine_client_from_engine_args)
+# from vllm.entrypoints.openai.api_server import (
+#     build_async_engine_client_from_engine_args)
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.utils import FlexibleArgumentParser, merge_async_iterators
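
The benchmark's hard import of the OpenAI API-server helper is commented out while the engine is being re-architected. A minimal sketch (an assumption, not part of this PR) of keeping the benchmark importable whether or not that code path is available:

# Sketch only: the try/except guard is an assumption, not part of this diff.
try:
    from vllm.entrypoints.openai.api_server import (
        build_async_engine_client_from_engine_args)
except ImportError:
    # Async-engine client path unavailable in the prototype; fall back to None.
    build_async_engine_client_from_engine_args = None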
examples/offline_inference.py (5 changes: 3 additions & 2 deletions)
@@ -8,10 +8,10 @@
     "The future of AI is",
 ]
 # Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95, logprobs=5)
 
 # Create an LLM.
-llm = LLM(model="facebook/opt-125m")
+llm = LLM(model="meta-llama/Llama-3.1-8B")
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.generate(prompts, sampling_params)
@@ -20,3 +20,4 @@
     prompt = output.prompt
     generated_text = output.outputs[0].text
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    print(output.outputs[0])
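
The example now requests per-token log probabilities (logprobs=5) and prints the full CompletionOutput. A minimal sketch of reading those fields, assuming the existing CompletionOutput attributes (text, logprobs); the prototype's outputs_v2.CompletionOutput may expose a different shape:

from vllm import LLM, SamplingParams

# Assumes the current public API; field names on the prototype's output class
# (vllm.outputs_v2) are not confirmed by this diff.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, logprobs=5)
llm = LLM(model="meta-llama/Llama-3.1-8B")
outputs = llm.generate(["The future of AI is"], sampling_params)

completion = outputs[0].outputs[0]
print(completion.text)       # generated continuation
print(completion.logprobs)   # top-5 log probabilities per generated token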
vllm/__init__.py (7 changes: 2 additions & 5 deletions)
@@ -2,13 +2,12 @@
 
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
-from vllm.engine.llm_engine import LLMEngine
+from vllm.engine.llm_engine_v2 import LLMEngine
 from vllm.entrypoints.llm import LLM
 from vllm.executor.ray_utils import initialize_ray_cluster
 from vllm.inputs import PromptType, TextPrompt, TokensPrompt
 from vllm.model_executor.models import ModelRegistry
-from vllm.outputs import (CompletionOutput, EmbeddingOutput,
-                          EmbeddingRequestOutput, RequestOutput)
+from vllm.outputs_v2 import CompletionOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
 
@@ -25,8 +24,6 @@
     "SamplingParams",
     "RequestOutput",
     "CompletionOutput",
-    "EmbeddingOutput",
-    "EmbeddingRequestOutput",
     "LLMEngine",
     "EngineArgs",
     "AsyncLLMEngine",
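
With this change the package root resolves LLMEngine from llm_engine_v2 and the output classes from outputs_v2, and the embedding outputs drop out of the top-level namespace. A short sketch of the resulting import surface (a reading of the diff above, not stated in the PR):

# Still re-exported from the package root after this diff.
from vllm import LLM, LLMEngine, RequestOutput, CompletionOutput

# No longer importable from the root, since both the import and the __all__
# entries were removed; these would now raise ImportError:
# from vllm import EmbeddingOutput, EmbeddingRequestOutput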