[FEATURE] Enables offline /score for embedding models #12021

Open · wants to merge 13 commits into base: main
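For context, a minimal sketch of the offline usage this PR enables: calling LLM.score with an embedding model, where each returned score is the cosine similarity of the pooled embeddings of a text pair. The model name is taken from the tests below; the exact output attributes are assumed from this branch.

from vllm import LLM

# With this change, LLM.score also accepts embedding models
# (previously only cross-encoder models were supported).
llm = LLM(model="sentence-transformers/all-MiniLM-L12-v2", task="embed")

# One query scored against several candidate texts; each score is the
# cosine similarity of the two pooled embeddings.
outputs = llm.score(
    "What is the capital of France?",
    ["The capital of France is Paris.", "The capital of Germany is Berlin."],
)
for output in outputs:
    print(output.outputs.score)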
100 changes: 100 additions & 0 deletions tests/models/embedding/language/test_scoring.py
@@ -5,12 +5,18 @@
import math

import pytest
import torch
import torch.nn.functional as F

MODELS = [
"cross-encoder/ms-marco-MiniLM-L-6-v2", # Bert
"BAAI/bge-reranker-v2-m3", # Roberta
]

EMBEDDING_MODELS = [
"sentence-transformers/all-MiniLM-L12-v2",
]

TEXTS_1 = [
"What is the capital of France?",
"What is the capital of Germany?",
@@ -87,3 +93,97 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str):

assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)


@pytest.fixture(scope="module", params=EMBEDDING_MODELS)
def emb_model_name(request):
yield request.param


@pytest.mark.parametrize("dtype", ["half"])
def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name,
dtype: str):

text_pair = [TEXTS_1[0], TEXTS_2[0]]

with hf_runner(emb_model_name, dtype=dtype,
is_sentence_transformer=True) as hf_model:
hf_embeddings = hf_model.encode(text_pair)
hf_outputs = [
F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0)
]

with vllm_runner(emb_model_name,
task="embed",
dtype=dtype,
max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])

assert len(vllm_outputs) == 1
assert len(hf_outputs) == 1

assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)


@pytest.mark.parametrize("dtype", ["half"])
def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
dtype: str):

text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[0], TEXTS_2[1]],
]

with hf_runner(emb_model_name, dtype=dtype,
is_sentence_transformer=True) as hf_model:
hf_embeddings = [
hf_model.encode(text_pair) for text_pair in text_pairs
]
hf_outputs = [
F.cosine_similarity(*map(torch.tensor, pair), dim=0)
for pair in hf_embeddings
]

with vllm_runner(emb_model_name,
task="embed",
dtype=dtype,
max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)

assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2

assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)


@pytest.mark.parametrize("dtype", ["half"])
def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
dtype: str):

text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]],
]

with hf_runner(emb_model_name, dtype=dtype,
is_sentence_transformer=True) as hf_model:
hf_embeddings = [
hf_model.encode(text_pair) for text_pair in text_pairs
]
hf_outputs = [
F.cosine_similarity(*map(torch.tensor, pair), dim=0)
for pair in hf_embeddings
]

with vllm_runner(emb_model_name,
task="embed",
dtype=dtype,
max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)

assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2

assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
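These tests use sentence-transformers as the reference implementation: for an embedding model, the value returned by LLM.score should match the cosine similarity of the two independently computed embeddings within rel_tol=0.01. A minimal sketch of that reference computation, reusing the model from EMBEDDING_MODELS (values are illustrative):

import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer

# Encode the pair exactly as the HF-side reference in the tests does.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")
emb_1, emb_2 = model.encode([
    "What is the capital of France?",
    "The capital of France is Paris.",
])

# LLM.score is expected to approximate this value for the same pair.
reference = F.cosine_similarity(torch.tensor(emb_1), torch.tensor(emb_2), dim=0)
print(float(reference))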
160 changes: 116 additions & 44 deletions vllm/entrypoints/llm.py
@@ -5,6 +5,7 @@
Tuple, Type, Union, cast, overload)

import cloudpickle
import torch
import torch.nn as nn
from tqdm import tqdm
from typing_extensions import TypeVar, deprecated
@@ -996,6 +997,107 @@ def classify(

return [ClassificationRequestOutput.from_base(item) for item in items]

def _embedding_score(
self,
tokenizer: AnyTokenizer,
text_1: List[Union[str, TextPrompt, TokensPrompt]],
text_2: List[Union[str, TextPrompt, TokensPrompt]],
truncate_prompt_tokens: Optional[int] = None,
use_tqdm: bool = True,
lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> List[ScoringRequestOutput]:

encoded_output = self.encode(
text_1 + text_2,
use_tqdm=use_tqdm,
lora_request=lora_request,
prompt_adapter_request=prompt_adapter_request)
encoded_output_1 = encoded_output[0:len(text_1)]
encoded_output_2 = encoded_output[len(text_1):]

if len(encoded_output_1) == 1:
encoded_output_1 = encoded_output_1 * len(encoded_output_2)

output_pairs = [(t1, t2)
for t1, t2 in zip(encoded_output_1, encoded_output_2)]

scores = []
scorer = torch.nn.CosineSimilarity(0)

for embed_1, embed_2 in output_pairs:
pair_score = scorer(embed_1.outputs.data, embed_2.outputs.data)

if (pad_token_id := getattr(tokenizer, "pad_token_id",
None)) is not None:
tokens = embed_1.prompt_token_ids + [
pad_token_id
] + embed_2.prompt_token_ids
else:
tokens = embed_1.prompt_token_ids + embed_2.prompt_token_ids

scores.append(
PoolingRequestOutput(
request_id=f"{embed_1.request_id}_{embed_2.request_id}",
outputs=pair_score,
prompt_token_ids=tokens,
finished=True))

items = self.engine_class.validate_outputs(scores,
PoolingRequestOutput)
return [ScoringRequestOutput.from_base(item) for item in items]

def _cross_encoding_score(
self,
tokenizer: AnyTokenizer,
text_1: List[Union[str, TextPrompt, TokensPrompt]],
text_2: List[Union[str, TextPrompt, TokensPrompt]],
truncate_prompt_tokens: Optional[int] = None,
use_tqdm: bool = True,
lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
prompt_adapter_request: Optional[PromptAdapterRequest] = None,
) -> List[ScoringRequestOutput]:

if isinstance(tokenizer, MistralTokenizer):
    raise ValueError(
        "MistralTokenizer is not supported for cross-encoding")

if len(text_1) == 1:
text_1 = text_1 * len(text_2)

input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)]

pooling_params = PoolingParams()

tokenization_kwargs: Dict[str, Any] = {}
if truncate_prompt_tokens is not None:
tokenization_kwargs["truncation"] = True
tokenization_kwargs["max_length"] = truncate_prompt_tokens

parsed_prompts = []

for q, t in input_pairs:
prompt_inputs = tokenizer(text=q,
text_pair=t,
**tokenization_kwargs)
engine_prompt = TokensPrompt(
prompt_token_ids=prompt_inputs["input_ids"],
token_type_ids=prompt_inputs.get("token_type_ids"))
parsed_prompts.append(engine_prompt)

self._validate_and_add_requests(
prompts=parsed_prompts,
params=pooling_params,
lora_request=lora_request,
prompt_adapter_request=prompt_adapter_request,
)

outputs = self._run_engine(use_tqdm=use_tqdm)
items = self.engine_class.validate_outputs(outputs,
PoolingRequestOutput)

return [ScoringRequestOutput.from_base(item) for item in items]

def score(
self,
text_1: Union[SingletonPrompt, Sequence[SingletonPrompt]],
@@ -1047,25 +1149,20 @@ def score(

raise ValueError(" ".join(messages))

if not self.llm_engine.model_config.is_cross_encoder:
raise ValueError("Your model does not support cross encoding")
if self.llm_engine.model_config.task != "score":
raise ValueError("Score API is only enabled for `--task score`")

tokenizer = self.llm_engine.get_tokenizer()

if isinstance(tokenizer, MistralTokenizer):
if self.llm_engine.model_config.task not in ("embed", "score"):
raise ValueError(
"MistralTokenizer not supported for cross-encoding")
"Score API is only enabled for `--task embed or --task score`")

# the tokenizer for models such as
# "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing
# lists of tokens to the `text` and `text_pair` kwargs
tokenizer = self.llm_engine.get_tokenizer()

def ensure_str(prompt: SingletonPrompt):
if isinstance(prompt, dict):
if "multi_modal_data" in prompt:
raise ValueError("Multi-modal prompt is not "
"supported for cross encoding")
"supported for scoring")
elif "prompt_token_ids" in prompt:
prompt = tokenizer.decode(
cast(TokensPrompt, prompt)["prompt_token_ids"])
@@ -1091,40 +1188,15 @@ def ensure_str(prompt: SingletonPrompt):
if len(text_2) == 0:
raise ValueError("At least one text_pair element must be given")

if len(text_1) == 1:
text_1 = text_1 * len(text_2)

input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)]
pooling_params = PoolingParams()

tokenization_kwargs: Dict[str, Any] = {}
if truncate_prompt_tokens is not None:
tokenization_kwargs["truncation"] = True
tokenization_kwargs["max_length"] = truncate_prompt_tokens

parsed_prompts = []

for q, t in input_pairs:
prompt_inputs = tokenizer(text=q,
text_pair=t,
**tokenization_kwargs)
engine_prompt = TokensPrompt(
prompt_token_ids=prompt_inputs["input_ids"],
token_type_ids=prompt_inputs.get("token_type_ids"))
parsed_prompts.append(engine_prompt)

self._validate_and_add_requests(
prompts=parsed_prompts,
params=pooling_params,
lora_request=lora_request,
prompt_adapter_request=prompt_adapter_request,
)

outputs = self._run_engine(use_tqdm=use_tqdm)
items = self.engine_class.validate_outputs(outputs,
PoolingRequestOutput)

return [ScoringRequestOutput.from_base(item) for item in items]
if self.llm_engine.model_config.is_cross_encoder:
return self._cross_encoding_score(tokenizer, text_1, text_2,
truncate_prompt_tokens, use_tqdm,
lora_request,
prompt_adapter_request)
else:
return self._embedding_score(tokenizer, text_1, text_2,
truncate_prompt_tokens, use_tqdm,
lora_request, prompt_adapter_request)

def start_profile(self) -> None:
self.llm_engine.start_profile()
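With these changes, score() dispatches on model_config.is_cross_encoder: cross-encoder models keep the existing tokenizer text/text_pair path (now in _cross_encoding_score), while embedding models go through _embedding_score, which embeds both text lists in a single encode() call, broadcasts a lone text_1 across all of text_2, and scores each pair by cosine similarity. A condensed, standalone sketch of that pairing step (function name and inputs are illustrative, not part of the PR):

import torch

def pairwise_embedding_scores(embeds_1: list[torch.Tensor],
                              embeds_2: list[torch.Tensor]) -> list[float]:
    # Broadcast a single left-hand embedding across every right-hand one,
    # mirroring the 1-to-N case handled by _embedding_score.
    if len(embeds_1) == 1:
        embeds_1 = embeds_1 * len(embeds_2)

    scorer = torch.nn.CosineSimilarity(dim=0)
    return [float(scorer(e1, e2)) for e1, e2 in zip(embeds_1, embeds_2)]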