diff --git a/tests/models/embedding/language/test_scoring.py b/tests/models/embedding/language/test_scoring.py
index be6e3842821e2..3db27d942ac8c 100644
--- a/tests/models/embedding/language/test_scoring.py
+++ b/tests/models/embedding/language/test_scoring.py
@@ -5,12 +5,18 @@
 import math
 
 import pytest
+import torch
+import torch.nn.functional as F
 
 MODELS = [
     "cross-encoder/ms-marco-MiniLM-L-6-v2",  # Bert
     "BAAI/bge-reranker-v2-m3",  # Roberta
 ]
 
+EMBEDDING_MODELS = [
+    "sentence-transformers/all-MiniLM-L12-v2",
+]
+
 TEXTS_1 = [
     "What is the capital of France?",
     "What is the capital of Germany?",
@@ -87,3 +93,97 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str):
 
     assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
     assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
+
+
+@pytest.fixture(scope="module", params=EMBEDDING_MODELS)
+def emb_model_name(request):
+    yield request.param
+
+
+@pytest.mark.parametrize("dtype", ["half"])
+def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name,
+                              dtype: str):
+
+    text_pair = [TEXTS_1[0], TEXTS_2[0]]
+
+    with hf_runner(emb_model_name, dtype=dtype,
+                   is_sentence_transformer=True) as hf_model:
+        hf_embeddings = hf_model.encode(text_pair)
+        hf_outputs = [
+            F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0)
+        ]
+
+    with vllm_runner(emb_model_name,
+                     task="embed",
+                     dtype=dtype,
+                     max_model_len=None) as vllm_model:
+        vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
+
+    assert len(vllm_outputs) == 1
+    assert len(hf_outputs) == 1
+
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+
+
+@pytest.mark.parametrize("dtype", ["half"])
+def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
+                              dtype: str):
+
+    text_pairs = [
+        [TEXTS_1[0], TEXTS_2[0]],
+        [TEXTS_1[0], TEXTS_2[1]],
+    ]
+
+    with hf_runner(emb_model_name, dtype=dtype,
+                   is_sentence_transformer=True) as hf_model:
+        hf_embeddings = [
+            hf_model.encode(text_pair) for text_pair in text_pairs
+        ]
+        hf_outputs = [
+            F.cosine_similarity(*map(torch.tensor, pair), dim=0)
+            for pair in hf_embeddings
+        ]
+
+    with vllm_runner(emb_model_name,
+                     task="embed",
+                     dtype=dtype,
+                     max_model_len=None) as vllm_model:
+        vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
+
+    assert len(vllm_outputs) == 2
+    assert len(hf_outputs) == 2
+
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
+
+
+@pytest.mark.parametrize("dtype", ["half"])
+def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
+                              dtype: str):
+
+    text_pairs = [
+        [TEXTS_1[0], TEXTS_2[0]],
+        [TEXTS_1[1], TEXTS_2[1]],
+    ]
+
+    with hf_runner(emb_model_name, dtype=dtype,
+                   is_sentence_transformer=True) as hf_model:
+        hf_embeddings = [
+            hf_model.encode(text_pair) for text_pair in text_pairs
+        ]
+        hf_outputs = [
+            F.cosine_similarity(*map(torch.tensor, pair), dim=0)
+            for pair in hf_embeddings
+        ]
+
+    with vllm_runner(emb_model_name,
+                     task="embed",
+                     dtype=dtype,
+                     max_model_len=None) as vllm_model:
+        vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
+
+    assert len(vllm_outputs) == 2
+    assert len(hf_outputs) == 2
+
+    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 27386daa4bbc9..9cc294ab50363 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -5,6 +5,7 @@
                     Tuple, Type, Union, cast, overload)
 
 import cloudpickle
+import torch
 import torch.nn as nn
 from tqdm import tqdm
 from typing_extensions import TypeVar, deprecated
@@ -996,6 +997,107 @@ def classify(
 
         return [ClassificationRequestOutput.from_base(item) for item in items]
 
+    def _embedding_score(
+        self,
+        tokenizer: AnyTokenizer,
+        text_1: List[Union[str, TextPrompt, TokensPrompt]],
+        text_2: List[Union[str, TextPrompt, TokensPrompt]],
+        truncate_prompt_tokens: Optional[int] = None,
+        use_tqdm: bool = True,
+        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+    ) -> List[ScoringRequestOutput]:
+
+        encoded_output = self.encode(
+            text_1 + text_2,
+            use_tqdm=use_tqdm,
+            lora_request=lora_request,
+            prompt_adapter_request=prompt_adapter_request)
+        encoded_output_1 = encoded_output[0:len(text_1)]
+        encoded_output_2 = encoded_output[len(text_1):]
+
+        if len(encoded_output_1) == 1:
+            encoded_output_1 = encoded_output_1 * len(encoded_output_2)
+
+        output_pairs = [(t1, t2)
+                        for t1, t2 in zip(encoded_output_1, encoded_output_2)]
+
+        scores = []
+        scorer = torch.nn.CosineSimilarity(0)
+
+        for embed_1, embed_2 in output_pairs:
+            pair_score = scorer(embed_1.outputs.data, embed_2.outputs.data)
+
+            if (pad_token_id := getattr(tokenizer, "pad_token_id",
+                                        None)) is not None:
+                tokens = embed_1.prompt_token_ids + [
+                    pad_token_id
+                ] + embed_2.prompt_token_ids
+            else:
+                tokens = embed_1.prompt_token_ids + embed_2.prompt_token_ids
+
+            scores.append(
+                PoolingRequestOutput(
+                    request_id=f"{embed_1.request_id}_{embed_2.request_id}",
+                    outputs=pair_score,
+                    prompt_token_ids=tokens,
+                    finished=True))
+
+        items = self.engine_class.validate_outputs(scores,
+                                                   PoolingRequestOutput)
+        return [ScoringRequestOutput.from_base(item) for item in items]
+
+    def _cross_encoding_score(
+        self,
+        tokenizer: Union[AnyTokenizer],
+        text_1: List[Union[str, TextPrompt, TokensPrompt]],
+        text_2: List[Union[str, TextPrompt, TokensPrompt]],
+        truncate_prompt_tokens: Optional[int] = None,
+        use_tqdm: bool = True,
+        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+    ) -> List[ScoringRequestOutput]:
+
+        if isinstance(tokenizer, MistralTokenizer):
+            raise ValueError(
+                "Score API is only enabled for `--task embed or score`")
+
+        if len(text_1) == 1:
+            text_1 = text_1 * len(text_2)
+
+        input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)]
+
+        pooling_params = PoolingParams()
+
+        tokenization_kwargs: Dict[str, Any] = {}
+        if truncate_prompt_tokens is not None:
+            tokenization_kwargs["truncation"] = True
+            tokenization_kwargs["max_length"] = truncate_prompt_tokens
+
+        parsed_prompts = []
+
+        for q, t in input_pairs:
+            prompt_inputs = tokenizer(text=q,
+                                      text_pair=t,
+                                      **tokenization_kwargs)
+            engine_prompt = TokensPrompt(
+                prompt_token_ids=prompt_inputs["input_ids"],
+                token_type_ids=prompt_inputs.get("token_type_ids"))
+            parsed_prompts.append(engine_prompt)
+
+        self._validate_and_add_requests(
+            prompts=parsed_prompts,
+            params=pooling_params,
+            lora_request=lora_request,
+            prompt_adapter_request=prompt_adapter_request,
+        )
+
+        outputs = self._run_engine(use_tqdm=use_tqdm)
+        items = self.engine_class.validate_outputs(outputs,
+                                                   PoolingRequestOutput)
+
+        return [ScoringRequestOutput.from_base(item) for item in items]
+
     def score(
         self,
         text_1: Union[SingletonPrompt, Sequence[SingletonPrompt]],
@@ -1047,25 +1149,20 @@ def score(
 
             raise ValueError(" ".join(messages))
 
-        if not self.llm_engine.model_config.is_cross_encoder:
-            raise ValueError("Your model does not support cross encoding")
-        if self.llm_engine.model_config.task != "score":
-            raise ValueError("Score API is only enabled for `--task score`")
-
-        tokenizer = self.llm_engine.get_tokenizer()
-
-        if isinstance(tokenizer, MistralTokenizer):
+        if self.llm_engine.model_config.task not in ("embed", "score"):
             raise ValueError(
-                "MistralTokenizer not supported for cross-encoding")
+                "Score API is only enabled for `--task embed or --task score`")
 
         # the tokenizer for models such as
         # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing
         # lists of tokens to the `text` and `text_pair` kwargs
+        tokenizer = self.llm_engine.get_tokenizer()
+
         def ensure_str(prompt: SingletonPrompt):
             if isinstance(prompt, dict):
                 if "multi_modal_data" in prompt:
                     raise ValueError("Multi-modal prompt is not "
-                                     "supported for cross encoding")
+                                     "supported for scoring")
                 elif "prompt_token_ids" in prompt:
                     prompt = tokenizer.decode(
                         cast(TokensPrompt, prompt)["prompt_token_ids"])
@@ -1091,40 +1188,15 @@ def ensure_str(prompt: SingletonPrompt):
         if len(text_2) == 0:
             raise ValueError("At least one text_pair element must be given")
 
-        if len(text_1) == 1:
-            text_1 = text_1 * len(text_2)
-
-        input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)]
-        pooling_params = PoolingParams()
-
-        tokenization_kwargs: Dict[str, Any] = {}
-        if truncate_prompt_tokens is not None:
-            tokenization_kwargs["truncation"] = True
-            tokenization_kwargs["max_length"] = truncate_prompt_tokens
-
-        parsed_prompts = []
-
-        for q, t in input_pairs:
-            prompt_inputs = tokenizer(text=q,
-                                      text_pair=t,
-                                      **tokenization_kwargs)
-            engine_prompt = TokensPrompt(
-                prompt_token_ids=prompt_inputs["input_ids"],
-                token_type_ids=prompt_inputs.get("token_type_ids"))
-            parsed_prompts.append(engine_prompt)
-
-        self._validate_and_add_requests(
-            prompts=parsed_prompts,
-            params=pooling_params,
-            lora_request=lora_request,
-            prompt_adapter_request=prompt_adapter_request,
-        )
-
-        outputs = self._run_engine(use_tqdm=use_tqdm)
-        items = self.engine_class.validate_outputs(outputs,
-                                                   PoolingRequestOutput)
-
-        return [ScoringRequestOutput.from_base(item) for item in items]
+        if self.llm_engine.model_config.is_cross_encoder:
+            return self._cross_encoding_score(tokenizer, text_1, text_2,
+                                              truncate_prompt_tokens, use_tqdm,
+                                              lora_request,
+                                              prompt_adapter_request)
+        else:
+            return self._embedding_score(tokenizer, text_1, text_2,
+                                         truncate_prompt_tokens, use_tqdm,
+                                         lora_request, prompt_adapter_request)
 
     def start_profile(self) -> None:
         self.llm_engine.start_profile()
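For context, here is a minimal, hypothetical offline usage sketch (not part of the diff) of the embedding-based scoring path that `_embedding_score()` adds. The model name mirrors `EMBEDDING_MODELS` from the test file; the candidate texts and the `output.outputs.score` attribute access are assumptions rather than anything shown above.

```python
# Hypothetical usage sketch, assuming a vLLM build that includes this change.
from vllm import LLM

# With task="embed", LLM.score() now routes through _embedding_score(), which
# computes the cosine similarity of the pooled embeddings instead of running
# a cross-encoder.
llm = LLM(model="sentence-transformers/all-MiniLM-L12-v2", task="embed")

# 1-to-N scoring, as exercised by test_llm_1_to_N_embedding: the single query
# is broadcast against every candidate text.
outputs = llm.score(
    "What is the capital of France?",
    ["Paris is the capital of France.", "Berlin is the capital of Germany."],
)

for output in outputs:
    # Assumed result layout: ScoringRequestOutput.outputs.score
    print(output.outputs.score)
```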