feat: use chunk data in NIAH and QA evals #1176

Merged: 11 commits, Oct 7, 2024
2 changes: 1 addition & 1 deletion src/leapfrogai_evals/.env.example
@@ -1,4 +1,4 @@
LEAPFROGAI_API_URL="https://leapfrogai-api.uds.dev/openai/v1"
LEAPFROGAI_API_URL="https://leapfrogai-api.uds.dev"
LEAPFROGAI_API_KEY="lfai-api-key"
ANTHROPIC_API_KEY="anthropic-api-key"

3 changes: 2 additions & 1 deletion src/leapfrogai_evals/README.md
@@ -18,7 +18,7 @@ cp .env.example .env
Within `.env`, replace the necessary environment variables:

```bash
LEAPFROGAI_API_URL=<LeapfrogAI API url, usually: https://leapfrogai-api.uds.dev/openai/v1 for development>
LEAPFROGAI_API_URL=<LeapfrogAI API url, usually: https://leapfrogai-api.uds.dev for development>
LEAPFROGAI_API_KEY=<LeapfrogAI API key>
ANTHROPIC_API_KEY=<Anthropic API key>
```
@@ -108,6 +108,7 @@ The LeapfrogAI NIAH evaluation uses the following process:
- prompt the LLM to provide the secret code hidden in the context
- record the following:
- whether or not the needle text was returned by the retrieval step of RAG
- which chunk from the retrieval step the needle was found in, if present
- whether or not the needle text was returned by the LLM's final response
- delete the contextual document from the vector store
- delete the assistant
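The recording step in the README's process list boils down to three per-trial fields. A minimal sketch of one recorded row, with illustrative values (the field names match the columns added to the dataset by `niah_runner.py`; everything else here is made up):

```python
# One NIAH trial record after run_experiment(); values are illustrative.
example_row = {
    "retrieval_score": 1.0,  # 1.0 if the needle text appeared in any retrieved chunk, else 0.0
    "chunk_rank": 0.8,       # (total_chunks - chunk_place) / total_chunks; 0.0 if never retrieved
    "response_score": 1.0,   # 1.0 if the LLM's final answer contained the secret code, else 0.0
}
```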
6 changes: 4 additions & 2 deletions src/leapfrogai_evals/evals/niah_eval.py
@@ -3,7 +3,7 @@

from deepeval.test_case import LLMTestCase

from leapfrogai_evals.metrics import NIAH_Retrieval, NIAH_Response
from leapfrogai_evals.metrics import NIAH_Retrieval, NIAH_Response, NIAH_Chunk_Rank
from leapfrogai_evals.runners import NIAH_Runner


@@ -26,6 +26,7 @@ def niah_eval(*args, **kwargs) -> dict:
additional_metadata={
"retrieval_score": row["retrieval_score"],
"response_score": row["response_score"],
"chunk_rank": row["chunk_rank"],
},
)
)
@@ -34,7 +35,8 @@ def niah_eval(*args, **kwargs) -> dict:
# TODO: Give ability to choose which metrics to run
retrieval_metric = NIAH_Retrieval()
response_metric = NIAH_Response()
metrics = [retrieval_metric, response_metric]
chunk_rank_metric = NIAH_Chunk_Rank()
metrics = [retrieval_metric, response_metric, chunk_rank_metric]

# record scores and return results
for metric in metrics:
12 changes: 10 additions & 2 deletions src/leapfrogai_evals/evals/qa_eval.py
@@ -2,7 +2,11 @@
import numpy as np
import os

from deepeval.metrics import AnswerRelevancyMetric
from deepeval.metrics import (
AnswerRelevancyMetric,
ContextualRelevancyMetric,
FaithfulnessMetric,
)
from deepeval.test_case import LLMTestCase

from leapfrogai_evals.metrics import AnnotationRelevancyMetric, CorrectnessMetric
@@ -27,11 +31,11 @@ def qa_eval(*args, **kwargs) -> dict:
actual_output=row["actual_output"],
context=row["context"],
expected_output=row["expected_output"],
retrieval_context=row["retrieval_context"],
additional_metadata={
"actual_annotations": row["actual_annotations"],
"expected_annotations": row["expected_annotations"],
},
# retrieval_context = row['retrieval_context'] # TODO: add this for more metrics
)
)

@@ -45,10 +49,14 @@ def qa_eval(*args, **kwargs) -> dict:
# TODO: Give ability to choose which metrics to run
correctness_metric = CorrectnessMetric(model=judge_model)
answer_relevancy_metric = AnswerRelevancyMetric(model=judge_model)
contextual_relevancy_metric = ContextualRelevancyMetric(model=judge_model)
faithfulness_metric = FaithfulnessMetric(model=judge_model)
annotation_relevancy_metric = AnnotationRelevancyMetric()
metrics = [
correctness_metric,
answer_relevancy_metric,
contextual_relevancy_metric,
faithfulness_metric,
annotation_relevancy_metric,
]

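Populating `retrieval_context` is what makes the two new DeepEval metrics usable, since `ContextualRelevancyMetric` and `FaithfulnessMetric` judge the retrieved chunks rather than only the final answer. A rough sketch under that assumption, with fabricated strings and the judge model left implicit (pass `model=` as `qa_eval.py` does):

```python
from deepeval.metrics import ContextualRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase

# Fabricated example; in qa_eval.py these fields come from the QA dataset rows.
test_case = LLMTestCase(
    input="What color is the sky?",
    actual_output="The sky is blue.",
    expected_output="Blue.",
    retrieval_context=["The report notes the sky appears blue on a clear day."],
)

# Both metrics read retrieval_context; without it they have nothing to evaluate.
# Pass model=<judge model> as qa_eval.py does; the default judge needs OpenAI credentials.
for metric in (ContextualRelevancyMetric(), FaithfulnessMetric()):
    metric.measure(test_case)
    print(type(metric).__name__, metric.score, metric.reason)
```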
6 changes: 5 additions & 1 deletion src/leapfrogai_evals/metrics/__init__.py
@@ -3,4 +3,8 @@

from leapfrogai_evals.metrics.annotation_relevancy import AnnotationRelevancyMetric
from leapfrogai_evals.metrics.correctness import CorrectnessMetric
from leapfrogai_evals.metrics.niah_metrics import NIAH_Response, NIAH_Retrieval
from leapfrogai_evals.metrics.niah_metrics import (
NIAH_Response,
NIAH_Retrieval,
NIAH_Chunk_Rank,
)
54 changes: 54 additions & 0 deletions src/leapfrogai_evals/metrics/niah_metrics.py
@@ -109,3 +109,57 @@ def is_successful(self) -> bool:
@property
def __name__(self):
return "Needle in a Haystack (NIAH) Response"


class NIAH_Chunk_Rank(BaseMetric):
"""A metric for measuring the chunk rank score from the LFAI Needle in a Haystack Evaluation"""

def __init__(
self,
threshold: float = 1.0,
async_mode: bool = True,
):
self.threshold = threshold
self.async_mode = async_mode

def measure(self, test_case: LLMTestCase) -> float:
"""
Records the NIAH chunk_rank from the test case

This function checks for the presence of a chunk rank (provided by the niah_runner)
and sets a success boolean based on that score. The score is calculated in the runner so that
the runner stays self-contained and can run the entire evaluation on its own; for simplicity,
the score is copied here for integration with DeepEval.

params:
-------
test_case: LLMTestCase
A test case object built from the results of a needle in a haystack evaluation run.
test_case should contain an additional metadata field that returns a dictionary with
the field "chunk_rank"

returns:
-------
float
The "chunk_rank" score taken from the test_case's additional metadata
"""
self.score = test_case.additional_metadata["chunk_rank"]
self.success = self.score >= self.threshold

if self.success:
self.reason = f"Response in the NIAH evaluation scored greater than or equal to the threshold score of {self.threshold}"
else:
self.reason = f"Response in the NIAH evaluation scored less than the threshold score of {self.threshold}"

return self.score

async def a_measure(self, test_case: LLMTestCase) -> float:
loop = asyncio.get_running_loop()
return await loop.run_in_executor(None, self.measure, test_case)

def is_successful(self) -> bool:
return self.success

@property
def __name__(self):
return "Needle in a Haystack (NIAH) Chunk Rank"
2 changes: 1 addition & 1 deletion src/leapfrogai_evals/models/lfai.py
@@ -24,7 +24,7 @@ def __init__(
):
self.model = model or os.getenv("MODEL_TO_EVALUATE")
self.api_key = api_key or os.getenv("LEAPFROGAI_API_KEY")
self.base_url = base_url or os.getenv("LEAPFROGAI_API_URL")
self.base_url = base_url or os.getenv("LEAPFROGAI_API_URL") + "/openai/v1"
self.client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url)

def load_model(self):
92 changes: 71 additions & 21 deletions src/leapfrogai_evals/runners/niah_runner.py
@@ -1,7 +1,9 @@
import ast
import logging
import numpy as np
import os
import openai
import requests

from datasets import load_dataset, concatenate_datasets
from distutils.util import strtobool
@@ -78,7 +80,7 @@ def __init__(
)

self.client = openai.OpenAI(
base_url=base_url or os.environ.get("LEAPFROGAI_API_URL"),
base_url=base_url or os.environ.get("LEAPFROGAI_API_URL") + "/openai/v1",
api_key=api_key or os.environ.get("LEAPFROGAI_API_KEY"),
)
logging.info(f"client url: {self.client.base_url}")
@@ -91,8 +93,6 @@ def __init__(
num_copies=int(os.environ.get("NIAH_NUM_COPIES", num_copies)),
)
self._create_vector_store()
self.retrieval_score = None
self.response_score = None

def run_experiment(self, cleanup: bool = True) -> None:
"""
@@ -110,6 +110,7 @@ def run_experiment(self, cleanup: bool = True) -> None:
try:
retrieval_scores = []
response_scores = []
chunk_ranks = []
response_contents = []

for row in tqdm(self.niah_data, desc="Evaluating data rows"):
@@ -162,32 +163,51 @@ def run_experiment(self, cleanup: bool = True) -> None:

retrieval_score = 0.0
response_score = 0.0
chunk_rank = 0.0
response_content = ""

for response in response_messages:
response_content += response.content[0].text.value + "\n"
secret_code = row["secret_code"]
chunk_ids = ast.literal_eval(response.metadata["vector_ids"])

# retrieval_score
# 1 if needle text was returned by the retrieval step of RAG else 0
logging.debug(
f"number of annotations in response: {len(response.content[0].text.annotations)}"
)
for annotation in response.content[0].text.annotations:
annotation_id = annotation.file_citation.file_id
if annotation_id == self.current_file:
logging.debug("Setting retrieval_score to 1.0")
# 1 if needle text is found in any chunk in the context, else 0
# chunk_rank
# see _calculate_chunk_rank for explanation
for chunk_num, chunk_id in enumerate(chunk_ids):
logging.info(f"chunk {chunk_num} (id: {chunk_id})")
vector_response = requests.get(
url=os.getenv("LEAPFROGAI_API_URL")
+ "/leapfrogai/v1/vector_stores/vector/"
+ chunk_id,
headers={
"accept": "application/json",
"Authorization": "Bearer "
+ os.getenv("LEAPFROGAI_API_KEY"),
},
).json()
logging.info(f"chunk_data: {vector_response['content']}")

if secret_code in vector_response["content"]:
logging.info(
f"secret code {secret_code} found in chunk {chunk_num} with id {vector_response['id']}"
)
chunk_rank = self._calculate_chunk_rank(
chunk_place=chunk_num, total_chunks=len(chunk_ids)
)
retrieval_score = 1.0

# # response_score
# # 1 if needle text was returned by the LLM's final response else 0
secret_code = row["secret_code"]
# response_score
# 1 if needle text was returned by the LLM's final response else 0
logging.info(f"Response message: {response.content[0].text.value}")
if secret_code in response.content[0].text.value:
logging.debug("Setting response_score to 1.0")
response_score = 1.0

retrieval_scores.append(retrieval_score)
response_scores.append(response_score)
chunk_ranks.append(chunk_rank)
response_contents.append(response_content)

# delete file to clean up the vector store
@@ -210,15 +230,16 @@ def run_experiment(self, cleanup: bool = True) -> None:
self.niah_data = self.niah_data.add_column(
name="response_score", column=response_scores
)
self.niah_data = self.niah_data.add_column(
name="chunk_rank", column=chunk_ranks
)
self.niah_data = self.niah_data.add_column(
name="response", column=response_contents
)

self.retrieval_score = np.mean(retrieval_scores)
self.response_score = np.mean(response_scores)

logging.info(f"Retrieval Score {self.retrieval_score}")
logging.info(f"Response Score {self.response_score}")
logging.info(f"Retrieval Score: {np.mean(retrieval_scores)}")
logging.info(f"Response Score: {np.mean(response_scores)}")
logging.info(f"Chunk Rank Score: {np.mean(chunk_ranks)}")

# remove artifacts from the API if the experiment fails
except Exception as exc:
@@ -264,7 +285,8 @@ def _load_niah_dataset(
"""
logging.info(f"Downloading dataset: {dataset_name} from HuggingFace")
niah_dataset = load_dataset(dataset_name)
self.padding = niah_dataset["padding"]
if self.add_padding:
self.padding = niah_dataset["padding"]
niah_dataset = concatenate_datasets(
[
niah_dataset["base_eval"],
@@ -339,8 +361,11 @@ def _create_vector_store(self) -> VectorStore:
logging.debug(
f"Added {len(self.padding)} files as padding to the haystack vector store"
)
self.padding = self.padding.add_column(
name="padding_id", column=padding_ids
)

self.vector_store = vector_store
self.padding = self.padding.add_column(name="padding_id", column=padding_ids)

def _delete_vector_store(self, vector_store_id: str) -> None:
"""Deletes the vector store used for all NIAH evaluations"""
@@ -360,3 +385,28 @@ def _delete_file(self, file_id: str) -> None:
file_id=file_id, vector_store_id=self.vector_store.id
)
self.client.files.delete(file_id=file_id)

def _calculate_chunk_rank(self, chunk_place: int, total_chunks: int) -> float:
"""
Calculate an individual chunk's rank

When the needle is found in a chunk, we calculate the rank of that chunk.
The rank is based on the chunk's position in the retrieval results (between 0 and total_chunks - 1),
using this formula:

chunk_rank_score = (total_chunks - chunk_place) / total_chunks

e.g.
total_chunks = 5
chunk_place = 0 (first in the list)
chunk_rank_score = (5 - 0) / 5 = 1.0

e.g.
total_chunks = 5
chunk_place = 4 (last in the 0-indexed list)
chunk_rank_score = (5 - 4) / 5 = 0.2

Not finding the needle results in a score of 0 (set outside this function).
"""
chunk_rank_score = float(total_chunks - chunk_place) / float(total_chunks)
return chunk_rank_score
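For intuition, the score decays linearly with chunk position; a throwaway check of the formula for a five-chunk retrieval (not part of the repo):

```python
# chunk_rank_score across positions for total_chunks = 5: linear decay from 1.0 to 0.2.
total_chunks = 5
scores = [(total_chunks - place) / total_chunks for place in range(total_chunks)]
print(scores)  # [1.0, 0.8, 0.6, 0.4, 0.2]
```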