diff --git a/src/leapfrogai_evals/.env.example b/src/leapfrogai_evals/.env.example
index 1235737c6..9a9370e29 100644
--- a/src/leapfrogai_evals/.env.example
+++ b/src/leapfrogai_evals/.env.example
@@ -1,4 +1,4 @@
-LEAPFROGAI_API_URL="https://leapfrogai-api.uds.dev/openai/v1"
+LEAPFROGAI_API_URL="https://leapfrogai-api.uds.dev"
 LEAPFROGAI_API_KEY="lfai-api-key"
 ANTHROPIC_API_KEY="anthropic-api-key"
 
diff --git a/src/leapfrogai_evals/README.md b/src/leapfrogai_evals/README.md
index 857ad8958..a7938cc90 100644
--- a/src/leapfrogai_evals/README.md
+++ b/src/leapfrogai_evals/README.md
@@ -18,7 +18,7 @@ cp .env.example .env
 Within `.env`, replace the necessary environment variables:
 
 ```bash
-LEAPFROGAI_API_URL=
+LEAPFROGAI_API_URL=
 LEAPFROGAI_API_KEY=
 ANTHROPIC_API_KEY=
 ```
@@ -108,6 +108,7 @@ The LeapfrogAI NIAH evaluation uses the following process:
 - prompt the LLM to provide the secret code hidden in the context
 - record the following:
   - whether or not the needle text was returned by the retrieval step of RAG
+  - which chunk from the retrieval step the needle was found in, if present
   - whether or not the needle text was returned by the LLM's final response
 - delete the contextual document from the vector store
 - delete the assistant
diff --git a/src/leapfrogai_evals/evals/niah_eval.py b/src/leapfrogai_evals/evals/niah_eval.py
index 9515f7030..2057a4124 100644
--- a/src/leapfrogai_evals/evals/niah_eval.py
+++ b/src/leapfrogai_evals/evals/niah_eval.py
@@ -3,7 +3,7 @@
 
 from deepeval.test_case import LLMTestCase
 
-from leapfrogai_evals.metrics import NIAH_Retrieval, NIAH_Response
+from leapfrogai_evals.metrics import NIAH_Retrieval, NIAH_Response, NIAH_Chunk_Rank
 from leapfrogai_evals.runners import NIAH_Runner
 
 
@@ -26,6 +26,7 @@ def niah_eval(*args, **kwargs) -> dict:
                 additional_metadata={
                     "retrieval_score": row["retrieval_score"],
                     "response_score": row["response_score"],
+                    "chunk_rank": row["chunk_rank"],
                 },
             )
         )
@@ -34,7 +35,8 @@ def niah_eval(*args, **kwargs) -> dict:
     # TODO: Give ability to choose which metrics to run
     retrieval_metric = NIAH_Retrieval()
     response_metric = NIAH_Response()
-    metrics = [retrieval_metric, response_metric]
+    chunk_rank_metric = NIAH_Chunk_Rank()
+    metrics = [retrieval_metric, response_metric, chunk_rank_metric]
 
     # record scores and return results
     for metric in metrics:
diff --git a/src/leapfrogai_evals/evals/qa_eval.py b/src/leapfrogai_evals/evals/qa_eval.py
index 88cb60926..941217a85 100644
--- a/src/leapfrogai_evals/evals/qa_eval.py
+++ b/src/leapfrogai_evals/evals/qa_eval.py
@@ -2,7 +2,11 @@
 import numpy as np
 import os
 
-from deepeval.metrics import AnswerRelevancyMetric
+from deepeval.metrics import (
+    AnswerRelevancyMetric,
+    ContextualRelevancyMetric,
+    FaithfulnessMetric,
+)
 from deepeval.test_case import LLMTestCase
 
 from leapfrogai_evals.metrics import AnnotationRelevancyMetric, CorrectnessMetric
@@ -27,11 +31,11 @@ def qa_eval(*args, **kwargs) -> dict:
                 actual_output=row["actual_output"],
                 context=row["context"],
                 expected_output=row["expected_output"],
+                retrieval_context=row["retrieval_context"],
                 additional_metadata={
                     "actual_annotations": row["actual_annotations"],
                     "expected_annotations": row["expected_annotations"],
                 },
-                # retrieval_context = row['retrieval_context'] # TODO: add this for more metrics
             )
         )
 
@@ -45,10 +49,14 @@ def qa_eval(*args, **kwargs) -> dict:
     # TODO: Give ability to choose which metrics to run
     correctness_metric = CorrectnessMetric(model=judge_model)
     answer_relevancy_metric = AnswerRelevancyMetric(model=judge_model)
+    contextual_relevancy_metric = ContextualRelevancyMetric(model=judge_model)
+    faithfulness_metric = FaithfulnessMetric(model=judge_model)
    annotation_relevancy_metric = AnnotationRelevancyMetric()
     metrics = [
         correctness_metric,
         answer_relevancy_metric,
+        contextual_relevancy_metric,
+        faithfulness_metric,
         annotation_relevancy_metric,
     ]
 
diff --git a/src/leapfrogai_evals/metrics/__init__.py b/src/leapfrogai_evals/metrics/__init__.py
index 428d526d5..16866822c 100644
--- a/src/leapfrogai_evals/metrics/__init__.py
+++ b/src/leapfrogai_evals/metrics/__init__.py
@@ -3,4 +3,8 @@
 
 from leapfrogai_evals.metrics.annotation_relevancy import AnnotationRelevancyMetric
 from leapfrogai_evals.metrics.correctness import CorrectnessMetric
-from leapfrogai_evals.metrics.niah_metrics import NIAH_Response, NIAH_Retrieval
+from leapfrogai_evals.metrics.niah_metrics import (
+    NIAH_Response,
+    NIAH_Retrieval,
+    NIAH_Chunk_Rank,
+)
diff --git a/src/leapfrogai_evals/metrics/niah_metrics.py b/src/leapfrogai_evals/metrics/niah_metrics.py
index 8595df165..1c5143a7a 100644
--- a/src/leapfrogai_evals/metrics/niah_metrics.py
+++ b/src/leapfrogai_evals/metrics/niah_metrics.py
@@ -109,3 +109,57 @@ def is_successful(self) -> bool:
     @property
     def __name__(self):
         return "Needle in a Haystack (NIAH) Response"
+
+
+class NIAH_Chunk_Rank(BaseMetric):
+    """A metric for measuring the chunk rank score from the LFAI Needle in a Haystack Evaluation"""
+
+    def __init__(
+        self,
+        threshold: float = 1.0,
+        async_mode: bool = True,
+    ):
+        self.threshold = threshold
+        self.async_mode = async_mode
+
+    def measure(self, test_case: LLMTestCase) -> int:
+        """
+        Records the NIAH chunk_rank from the test case
+
+        This function checks for the presence of a chunk rank (provided by the niah_runner)
+        and sets a boolean determined by said score. The score is calculated in the runner to keep the
+        runner self-contained as a means of running the entire evaluation on its own. For simplicity,
+        the score is copied here for integration with DeepEval.
+
+        params:
+        -------
+        test_case: LLMTestCase
+            A test case object built from the results of a needle in a haystack evaluation run.
+            test_case should contain an additional metadata field that returns a dictionary with
+            the field "chunk_rank"
+
+        returns:
+        -------
+        int
+            A score that is equal to the "chunk_rank" from the test_case
+        """
+        self.score = test_case.additional_metadata["chunk_rank"]
+        self.success = self.score >= self.threshold
+
+        if self.success:
+            self.reason = f"Response in the NIAH evaluation scored greater than or equal to the threshold score of {self.threshold}"
+        else:
+            self.reason = f"Response in the NIAH evaluation scored less than the threshold score of {self.threshold}"
+
+        return self.score
+
+    async def a_measure(self, test_case: LLMTestCase) -> int:
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(None, self.measure, test_case)
+
+    def is_successful(self) -> bool:
+        return self.success
+
+    @property
+    def __name__(self):
+        return "Needle in a Haystack (NIAH) Chunk Rank"
diff --git a/src/leapfrogai_evals/models/lfai.py b/src/leapfrogai_evals/models/lfai.py
index fca7c8de4..88cc65741 100644
--- a/src/leapfrogai_evals/models/lfai.py
+++ b/src/leapfrogai_evals/models/lfai.py
@@ -24,7 +24,7 @@ def __init__(
     ):
         self.model = model or os.getenv("MODEL_TO_EVALUATE")
         self.api_key = api_key or os.getenv("LEAPFROGAI_API_KEY")
-        self.base_url = base_url or os.getenv("LEAPFROGAI_API_URL")
+        self.base_url = base_url or os.getenv("LEAPFROGAI_API_URL") + "/openai/v1"
         self.client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)
 
     def load_model(self):
diff --git a/src/leapfrogai_evals/runners/niah_runner.py b/src/leapfrogai_evals/runners/niah_runner.py
index 7da66e7df..4f711bdff 100644
--- a/src/leapfrogai_evals/runners/niah_runner.py
+++ b/src/leapfrogai_evals/runners/niah_runner.py
@@ -1,7 +1,9 @@
+import ast
 import logging
 import numpy as np
 import os
 import openai
+import requests
 
 from datasets import load_dataset, concatenate_datasets
 from distutils.util import strtobool
@@ -78,7 +80,7 @@ def __init__(
         )
 
         self.client = openai.OpenAI(
-            base_url=base_url or os.environ.get("LEAPFROGAI_API_URL"),
+            base_url=base_url or os.environ.get("LEAPFROGAI_API_URL") + "/openai/v1",
             api_key=api_key or os.environ.get("LEAPFROGAI_API_KEY"),
         )
         logging.info(f"client url: {self.client.base_url}")
@@ -91,8 +93,6 @@ def __init__(
             num_copies=int(os.environ.get("NIAH_NUM_COPIES", num_copies)),
         )
         self._create_vector_store()
-        self.retrieval_score = None
-        self.response_score = None
 
     def run_experiment(self, cleanup: bool = True) -> None:
         """
@@ -110,6 +110,7 @@ def run_experiment(self, cleanup: bool = True) -> None:
         try:
             retrieval_scores = []
             response_scores = []
+            chunk_ranks = []
             response_contents = []
 
             for row in tqdm(self.niah_data, desc="Evaluating data rows"):
@@ -162,25 +163,43 @@ def run_experiment(self, cleanup: bool = True) -> None:
 
                 retrieval_score = 0.0
                 response_score = 0.0
+                chunk_rank = 0.0
                 response_content = ""
 
                 for response in response_messages:
                     response_content += response.content[0].text.value + "\n"
+                    secret_code = row["secret_code"]
+                    chunk_ids = ast.literal_eval(response.metadata["vector_ids"])
 
                     # retrieval_score
-                    # 1 if needle text was returned by the retrieval step of RAG else 0
-                    logging.debug(
-                        f"number of annotations in response: {len(response.content[0].text.annotations)}"
-                    )
-                    for annotation in response.content[0].text.annotations:
-                        annotation_id = annotation.file_citation.file_id
-                        if annotation_id == self.current_file:
-                            logging.debug("Setting retrieval_score to 1.0")
+                    # 1 if needle text is found in any chunk in the context, else 0
+                    # chunk_rank
+                    # see _calculate_chunk_rank for explanation
+                    for chunk_num, chunk_id in enumerate(chunk_ids):
+                        logging.info(f"chunk {chunk_num} (id: {chunk_id})")
+                        vector_response = requests.get(
+                            url=os.getenv("LEAPFROGAI_API_URL")
+                            + "/leapfrogai/v1/vector_stores/vector/"
+                            + chunk_id,
+                            headers={
+                                "accept": "application/json",
+                                "Authorization": "Bearer "
+                                + os.getenv("LEAPFROGAI_API_KEY"),
+                            },
+                        ).json()
+                        logging.info(f"chunk_data: {vector_response['content']}")
+
+                        if secret_code in vector_response["content"]:
+                            logging.info(
+                                f"secret code {secret_code} found in chunk {chunk_num} with id {vector_response['id']}"
+                            )
+                            chunk_rank = self._calculate_chunk_rank(
+                                chunk_place=chunk_num, total_chunks=len(chunk_ids)
+                            )
                             retrieval_score = 1.0
 
-                    # # response_score
-                    # # 1 if needle text was returned by the LLM's final response else 0
-                    secret_code = row["secret_code"]
+                    # response_score
+                    # 1 if needle text was returned by the LLM's final response else 0
                     logging.info(f"Response message: {response.content[0].text.value}")
                     if secret_code in response.content[0].text.value:
                         logging.debug("Setting response_score to 1.0")
@@ -188,6 +207,7 @@ def run_experiment(self, cleanup: bool = True) -> None:
 
                 retrieval_scores.append(retrieval_score)
                 response_scores.append(response_score)
+                chunk_ranks.append(chunk_rank)
                 response_contents.append(response_content)
 
                 # delete file to clean up the vector store
@@ -210,15 +230,16 @@ def run_experiment(self, cleanup: bool = True) -> None:
             self.niah_data = self.niah_data.add_column(
                 name="response_score", column=response_scores
             )
+            self.niah_data = self.niah_data.add_column(
+                name="chunk_rank", column=chunk_ranks
+            )
             self.niah_data = self.niah_data.add_column(
                 name="response", column=response_contents
             )
 
-            self.retrieval_score = np.mean(retrieval_scores)
-            self.response_score = np.mean(response_scores)
-
-            logging.info(f"Retrieval Score {self.retrieval_score}")
-            logging.info(f"Response Score {self.response_score}")
+            logging.info(f"Retrieval Score: {np.mean(retrieval_scores)}")
+            logging.info(f"Response Score: {np.mean(response_scores)}")
+            logging.info(f"Chunk Rank Score: {np.mean(chunk_ranks)}")
 
         # remove artifacts from the API if the experiment fails
         except Exception as exc:
@@ -264,7 +285,8 @@ def _load_niah_dataset(
         """
         logging.info(f"Downloading dataset: {dataset_name} from HuggingFace")
         niah_dataset = load_dataset(dataset_name)
-        self.padding = niah_dataset["padding"]
+        if self.add_padding:
+            self.padding = niah_dataset["padding"]
         niah_dataset = concatenate_datasets(
             [
                 niah_dataset["base_eval"],
@@ -339,8 +361,11 @@ def _create_vector_store(self) -> VectorStore:
             logging.debug(
                 f"Added {len(self.padding)} files as padding to the haystack vector store"
             )
+            self.padding = self.padding.add_column(
+                name="padding_id", column=padding_ids
+            )
+
         self.vector_store = vector_store
-        self.padding = self.padding.add_column(name="padding_id", column=padding_ids)
 
     def _delete_vector_store(self, vector_store_id: str) -> None:
         """Deletes the vector store used for all NIAH evaluations"""
@@ -360,3 +385,28 @@ def _delete_file(self, file_id: str) -> None:
             file_id=file_id, vector_store_id=self.vector_store.id
         )
         self.client.files.delete(file_id=file_id)
+
+    def _calculate_chunk_rank(self, chunk_place: int, total_chunks: int) -> float:
+        """
+        Calculate an individual chunk's rank
+
+        When a needle is found in a certain chunk, we calculate the rank of that chunk.
+        This rank is based on what place in the responses it came (between 0 and total_chunks-1)
+        using this formula:
+
+        chunk_rank_score = (total_chunks - chunk_place) / total_chunks
+
+        e.g.
+        total_chunks = 5
+        chunk_place = 0 (first in the list)
+        chunk_rank_score = (5 - 0) / 5 = 1.0
+
+        e.g.
+        total_chunks = 5
+        chunk_place = 4 (last in 0 indexed list)
+        chunk_rank_score = (5 - 4) / 5 = 0.2
+
+        not finding the needle results in a score of 0 (set outside this function)
+        """
+        chunk_rank_score = float(total_chunks - chunk_place) / float(total_chunks)
+        return chunk_rank_score
diff --git a/src/leapfrogai_evals/runners/qa_runner.py b/src/leapfrogai_evals/runners/qa_runner.py
index 4875e2ff8..cd297689e 100644
--- a/src/leapfrogai_evals/runners/qa_runner.py
+++ b/src/leapfrogai_evals/runners/qa_runner.py
@@ -1,6 +1,8 @@
+import ast
 import logging
 import os
 import openai
+import requests
 import shutil
 import zipfile
 
@@ -52,6 +54,7 @@ def __init__(
         self.vector_store = None
         self.file_dict = None
         self.current_assistant = None
+        self.api_key = api_key or os.getenv("LEAPFROGAI_API_KEY")
         self.dataset_name = os.environ.get("QA_DATASET", dataset)
         self.model = os.environ.get("MODEL_TO_EVALUATE", model)
         self.temperature = float(os.environ.get("TEMPERATURE", temperature))
@@ -72,8 +75,8 @@ def __init__(
         )
 
         self.client = openai.OpenAI(
-            base_url=base_url or os.getenv("LEAPFROGAI_API_URL"),
-            api_key=api_key or os.getenv("LEAPFROGAI_API_KEY"),
+            base_url=base_url or os.getenv("LEAPFROGAI_API_URL") + "/openai/v1",
+            api_key=self.api_key,
         )
         logging.info(f"client url: {self.client.base_url}")
         try:  # use existing vector store if supplied
@@ -101,6 +104,7 @@ def run_experiment(self) -> None:
 
         try:
             response_contents = []
+            retrieved_contexts = []
             expected_annotations = []
             actual_annotations = []
 
@@ -132,21 +136,40 @@ def run_experiment(self) -> None:
                         response_messages.append(message)
 
                 response_content = ""
+                retrieved_context = []
                 response_annotations = []
 
                 for response in response_messages:
                     response_content += response.content[0].text.value + "\n"
+                    chunk_ids = ast.literal_eval(response.metadata["vector_ids"])
+
+                    # retrieve context used to generate response
+                    for chunk_id in chunk_ids:
+                        vector_response = requests.get(
+                            url=os.getenv("LEAPFROGAI_API_URL")
+                            + "/leapfrogai/v1/vector_stores/vector/"
+                            + chunk_id,
+                            headers={
+                                "accept": "application/json",
+                                "Authorization": "Bearer " + self.api_key,
+                            },
+                        ).json()
+                        retrieved_context.append(vector_response["content"])
 
                     for annotation in response.content[0].text.annotations:
                         annotation_id = annotation.file_citation.file_id
                         response_annotations.append(annotation_id)
 
-                    logging.debug(
+                    logging.info(
                         f"number of annotations in response: {len(response.content[0].text.annotations)}"
                     )
 
                 expected_annotations.append([self.file_dict[row["source_file"]]])
                 actual_annotations.append(response_annotations)
 
+                logging.info(
+                    f"Retrieved context recorded: {vector_response['content']}"
+                )
+                retrieved_contexts.append(retrieved_context)
                 logging.info(f"Response recorded:\n{response_content}")
                 response_contents.append(response_content)
@@ -159,6 +182,9 @@ def run_experiment(self) -> None:
         self.qa_data = self.qa_data.add_column(
             name="actual_output", column=response_contents
         )
+        self.qa_data = self.qa_data.add_column(
+            name="retrieval_context", column=retrieved_contexts
+        )
         self.qa_data = self.qa_data.add_column(
             name="expected_annotations", column=expected_annotations
         )
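
For a quick sanity check of the chunk-rank scoring added in `_calculate_chunk_rank`, here is a minimal standalone sketch (not part of the diff) that reproduces only the formula and the two worked examples from the docstring; the function name here is illustrative, not an API from the repository.

```python
# Hypothetical helper mirroring the formula documented in _calculate_chunk_rank.
def chunk_rank_score(chunk_place: int, total_chunks: int) -> float:
    """Earlier chunks (lower chunk_place, 0-indexed) score closer to 1.0."""
    return float(total_chunks - chunk_place) / float(total_chunks)


if __name__ == "__main__":
    # Worked examples from the docstring: first chunk scores 1.0, last of five scores 0.2.
    assert chunk_rank_score(chunk_place=0, total_chunks=5) == 1.0
    assert abs(chunk_rank_score(chunk_place=4, total_chunks=5) - 0.2) < 1e-9
    print("chunk-rank examples check out")
```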