From 3aade0912df63bc3898b0d88493b82ee40ec78b6 Mon Sep 17 00:00:00 2001 From: John Alling Date: Tue, 1 Oct 2024 18:27:33 -0400 Subject: [PATCH 1/9] begin reading chunk data from annotations --- src/leapfrogai_evals/runners/niah_runner.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/leapfrogai_evals/runners/niah_runner.py b/src/leapfrogai_evals/runners/niah_runner.py index 7da66e7df..f88f33dba 100644 --- a/src/leapfrogai_evals/runners/niah_runner.py +++ b/src/leapfrogai_evals/runners/niah_runner.py @@ -2,6 +2,7 @@ import numpy as np import os import openai +import requests from datasets import load_dataset, concatenate_datasets from distutils.util import strtobool @@ -167,6 +168,23 @@ def run_experiment(self, cleanup: bool = True) -> None: for response in response_messages: response_content += response.content[0].text.value + "\n" + secret_code = row["secret_code"] + + # get chunk data from response + chunk_vectors = response.content[0].metadata["vector_ids"] + + for chunk_vector in chunk_vectors: + request_params = { + "vector_id": chunk_vector, + "bearer": os.getenv("LEAPFROGAI_API_KEY"), + } + vector_response = requests.get( + os.getenv("LEAPFROGAI_BASE_URL"), params=request_params + ) + + if secret_code in vector_response.json(): + retrieval_score = 1.0 + # retrieval_score # 1 if needle text was returned by the retrieval step of RAG else 0 logging.debug( @@ -180,7 +198,6 @@ def run_experiment(self, cleanup: bool = True) -> None: # # response_score # # 1 if needle text was returned by the LLM's final response else 0 - secret_code = row["secret_code"] logging.info(f"Response message: {response.content[0].text.value}") if secret_code in response.content[0].text.value: logging.debug("Setting response_score to 1.0") From 8b7e0808a127899f3edd3bf0dd4f7b0e937c17fc Mon Sep 17 00:00:00 2001 From: John Alling Date: Wed, 2 Oct 2024 13:01:36 -0400 Subject: [PATCH 2/9] upgrade openai version --- src/leapfrogai_evals/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/leapfrogai_evals/pyproject.toml b/src/leapfrogai_evals/pyproject.toml index 1974da81a..f6687f368 100644 --- a/src/leapfrogai_evals/pyproject.toml +++ b/src/leapfrogai_evals/pyproject.toml @@ -8,7 +8,7 @@ version = "0.13.1" dependencies = [ "deepeval == 1.3.0", - "openai == 1.42.0", + "openai == 1.45.0", "tqdm == 4.66.5", "python-dotenv == 1.0.1", "seaborn == 0.13.2", From 1cde77e414e218e2eb93aba87722733cadedfc4c Mon Sep 17 00:00:00 2001 From: John Alling Date: Wed, 2 Oct 2024 16:20:56 -0400 Subject: [PATCH 3/9] add chunk_rank metric to niah --- src/leapfrogai_evals/evals/niah_eval.py | 6 +- src/leapfrogai_evals/metrics/__init__.py | 6 +- src/leapfrogai_evals/metrics/niah_metrics.py | 54 ++++++++++ src/leapfrogai_evals/runners/niah_runner.py | 103 ++++++++++++------- 4 files changed, 131 insertions(+), 38 deletions(-) diff --git a/src/leapfrogai_evals/evals/niah_eval.py b/src/leapfrogai_evals/evals/niah_eval.py index 9515f7030..2057a4124 100644 --- a/src/leapfrogai_evals/evals/niah_eval.py +++ b/src/leapfrogai_evals/evals/niah_eval.py @@ -3,7 +3,7 @@ from deepeval.test_case import LLMTestCase -from leapfrogai_evals.metrics import NIAH_Retrieval, NIAH_Response +from leapfrogai_evals.metrics import NIAH_Retrieval, NIAH_Response, NIAH_Chunk_Rank from leapfrogai_evals.runners import NIAH_Runner @@ -26,6 +26,7 @@ def niah_eval(*args, **kwargs) -> dict: additional_metadata={ "retrieval_score": row["retrieval_score"], "response_score": row["response_score"], + 
"chunk_rank": row["chunk_rank"], }, ) ) @@ -34,7 +35,8 @@ def niah_eval(*args, **kwargs) -> dict: # TODO: Give ability to choose which metrics to run retrieval_metric = NIAH_Retrieval() response_metric = NIAH_Response() - metrics = [retrieval_metric, response_metric] + chunk_rank_metric = NIAH_Chunk_Rank() + metrics = [retrieval_metric, response_metric, chunk_rank_metric] # record scores and return results for metric in metrics: diff --git a/src/leapfrogai_evals/metrics/__init__.py b/src/leapfrogai_evals/metrics/__init__.py index 428d526d5..16866822c 100644 --- a/src/leapfrogai_evals/metrics/__init__.py +++ b/src/leapfrogai_evals/metrics/__init__.py @@ -3,4 +3,8 @@ from leapfrogai_evals.metrics.annotation_relevancy import AnnotationRelevancyMetric from leapfrogai_evals.metrics.correctness import CorrectnessMetric -from leapfrogai_evals.metrics.niah_metrics import NIAH_Response, NIAH_Retrieval +from leapfrogai_evals.metrics.niah_metrics import ( + NIAH_Response, + NIAH_Retrieval, + NIAH_Chunk_Rank, +) diff --git a/src/leapfrogai_evals/metrics/niah_metrics.py b/src/leapfrogai_evals/metrics/niah_metrics.py index 8595df165..1c5143a7a 100644 --- a/src/leapfrogai_evals/metrics/niah_metrics.py +++ b/src/leapfrogai_evals/metrics/niah_metrics.py @@ -109,3 +109,57 @@ def is_successful(self) -> bool: @property def __name__(self): return "Needle in a Haystack (NIAH) Response" + + +class NIAH_Chunk_Rank(BaseMetric): + """A metric for measuring the chunk rank score from the LFAI Needle in a Haystack Evaluation""" + + def __init__( + self, + threshold: float = 1.0, + async_mode: bool = True, + ): + self.threshold = threshold + self.async_mode = async_mode + + def measure(self, test_case: LLMTestCase) -> int: + """ + Records the niah chunk_rank from the test case + + This function checks for the presence of a chunk rank (provided by the niah_runner) + and sets a boolean determined by said score. The score is calculated in the runner to keep the + runner self-contained as a means of running the entire evaluation on its own. For simplicity, + the score is copied here for integration with DeepEval. + + params: + ------- + test_case: LLMTestCase + A test case object built from the results of a needle in a haystack evaluation run. 
+ test_case should contain an additional metadata field that returns a dictionary with + the field "chunk_rank" + + returns: + ------- + int + A score that is equal to the "chunk_rank" from the test_case + """ + self.score = test_case.additional_metadata["chunk_rank"] + self.success = self.score >= self.threshold + + if self.success: + self.reason = f"Response in the NIAH evaluation scored greater than or equal to the threshold score of {self.threshold}" + else: + self.reason = f"Response in the NIAH evaluation scored less than the threshold score of {self.threshold}" + + return self.score + + async def a_measure(self, test_case: LLMTestCase) -> int: + loop = asyncio.get_running_loop() + return await loop.run_in_executor(None, self.measure, test_case) + + def is_successful(self) -> bool: + return self.success + + @property + def __name__(self): + return "Needle in a Haystack (NIAH) Chunk Rank" diff --git a/src/leapfrogai_evals/runners/niah_runner.py b/src/leapfrogai_evals/runners/niah_runner.py index f88f33dba..99bfae069 100644 --- a/src/leapfrogai_evals/runners/niah_runner.py +++ b/src/leapfrogai_evals/runners/niah_runner.py @@ -1,3 +1,4 @@ +import ast import logging import numpy as np import os @@ -92,8 +93,6 @@ def __init__( num_copies=int(os.environ.get("NIAH_NUM_COPIES", num_copies)), ) self._create_vector_store() - self.retrieval_score = None - self.response_score = None def run_experiment(self, cleanup: bool = True) -> None: """ @@ -111,6 +110,7 @@ def run_experiment(self, cleanup: bool = True) -> None: try: retrieval_scores = [] response_scores = [] + chunk_ranks = [] response_contents = [] for row in tqdm(self.niah_data, desc="Evaluating data rows"): @@ -163,41 +163,43 @@ def run_experiment(self, cleanup: bool = True) -> None: retrieval_score = 0.0 response_score = 0.0 + chunk_rank = 0.0 response_content = "" for response in response_messages: response_content += response.content[0].text.value + "\n" - secret_code = row["secret_code"] - - # get chunk data from response - chunk_vectors = response.content[0].metadata["vector_ids"] - - for chunk_vector in chunk_vectors: - request_params = { - "vector_id": chunk_vector, - "bearer": os.getenv("LEAPFROGAI_API_KEY"), - } - vector_response = requests.get( - os.getenv("LEAPFROGAI_BASE_URL"), params=request_params - ) - - if secret_code in vector_response.json(): - retrieval_score = 1.0 + chunk_ids = ast.literal_eval(response.metadata["vector_ids"]) # retrieval_score - # 1 if needle text was returned by the retrieval step of RAG else 0 - logging.debug( - f"number of annotations in response: {len(response.content[0].text.annotations)}" - ) - for annotation in response.content[0].text.annotations: - annotation_id = annotation.file_citation.file_id - if annotation_id == self.current_file: - logging.debug("Setting retrieval_score to 1.0") + # 1 if needle text is found in any chunk in the context, else 0 + # chunk_rank + # see _calculate_chunk_rank for explanation + for chunk_num, chunk_id in enumerate(chunk_ids): + logging.info(f"chunk {chunk_num} (id: {chunk_id})") + vector_response = requests.get( + url=os.getenv("LEAPFROGAI_API_LFAI_URL") + + "/vector_stores/vector/" + + chunk_id, + headers={ + "accept": "application/json", + "Authorization": "Bearer " + + os.getenv("LEAPFROGAI_API_KEY"), + }, + ).json() + logging.info(f"chunk_data: {vector_response['content']}") + + if secret_code in vector_response["content"]: + logging.info( + f"secret code {secret_code} found in chunk {chunk_num} with id {vector_response['id']}" + ) + chunk_rank = 
self._calculate_chunk_rank( + chunk_place=chunk_num, total_chunks=len(chunk_ids) + ) retrieval_score = 1.0 - # # response_score - # # 1 if needle text was returned by the LLM's final response else 0 + # response_score + # 1 if needle text was returned by the LLM's final response else 0 logging.info(f"Response message: {response.content[0].text.value}") if secret_code in response.content[0].text.value: logging.debug("Setting response_score to 1.0") @@ -205,6 +207,7 @@ def run_experiment(self, cleanup: bool = True) -> None: retrieval_scores.append(retrieval_score) response_scores.append(response_score) + chunk_ranks.append(chunk_rank) response_contents.append(response_content) # delete file to clean up the vector store @@ -227,15 +230,16 @@ def run_experiment(self, cleanup: bool = True) -> None: self.niah_data = self.niah_data.add_column( name="response_score", column=response_scores ) + self.niah_data = self.niah_data.add_column( + name="chunk_rank", column=chunk_ranks + ) self.niah_data = self.niah_data.add_column( name="response", column=response_contents ) - self.retrieval_score = np.mean(retrieval_scores) - self.response_score = np.mean(response_scores) - - logging.info(f"Retrieval Score {self.retrieval_score}") - logging.info(f"Response Score {self.response_score}") + logging.info(f"Retrieval Score: {np.mean(retrieval_scores)}") + logging.info(f"Response Score: {np.mean(response_scores)}") + logging.info(f"Chunk Rank Score: {np.mean(chunk_ranks)}") # remove artifacts from the API if the experiment fails except Exception as exc: @@ -281,7 +285,8 @@ def _load_niah_dataset( """ logging.info(f"Downloading dataset: {dataset_name} from HuggingFace") niah_dataset = load_dataset(dataset_name) - self.padding = niah_dataset["padding"] + if self.add_padding: + self.padding = niah_dataset["padding"] niah_dataset = concatenate_datasets( [ niah_dataset["base_eval"], @@ -356,8 +361,11 @@ def _create_vector_store(self) -> VectorStore: logging.debug( f"Added {len(self.padding)} files as padding to the haystack vector store" ) + self.padding = self.padding.add_column( + name="padding_id", column=padding_ids + ) + self.vector_store = vector_store - self.padding = self.padding.add_column(name="padding_id", column=padding_ids) def _delete_vector_store(self, vector_store_id: str) -> None: """Deletes the vector store used for all NIAH evaluations""" @@ -377,3 +385,28 @@ def _delete_file(self, file_id: str) -> None: file_id=file_id, vector_store_id=self.vector_store.id ) self.client.files.delete(file_id=file_id) + + def _calculate_chunk_rank(self, chunk_place: int, total_chunks: int) -> float: + """ + Calculate an individual chunk's rank + + When a needle is found in a certain chunk, we caclulate the rank of that chunk + This rank is based on what place in the responses it came (between 0 and total_chunks-1) + using this formula: + + chunk_rank_score = (total_chunks - chunk_place) / total_chunks + + e.g + total_chunks = 5 + chunk_place = 0 (first in the list) + chunk_rank_score = (5 - 0) / 5 = 1.0 + + e.g + total_chunks = 5 + chunk_place = 4 (last in 0 indexed list) + chunk_rank_score = (5 - 4) / 5 = 0.2 + + not finding the needle results in a score of 0 (set outside this function) + """ + chunk_rank_score = float(total_chunks - chunk_place) / float(total_chunks) + return chunk_rank_score From dc315cc0421c1e51dd0701d971826ef006689a33 Mon Sep 17 00:00:00 2001 From: John Alling Date: Wed, 2 Oct 2024 16:21:51 -0400 Subject: [PATCH 4/9] update env.example --- src/leapfrogai_evals/.env.example | 1 + 1 file 
changed, 1 insertion(+) diff --git a/src/leapfrogai_evals/.env.example b/src/leapfrogai_evals/.env.example index 1235737c6..7ae533c01 100644 --- a/src/leapfrogai_evals/.env.example +++ b/src/leapfrogai_evals/.env.example @@ -1,4 +1,5 @@ LEAPFROGAI_API_URL="https://leapfrogai-api.uds.dev/openai/v1" +LEAPFROGAI_API_LFAI_URL=https://leapfrogai-api.uds.dev/leapfrogai/v1 LEAPFROGAI_API_KEY="lfai-api-key" ANTHROPIC_API_KEY="anthropic-api-key" From e62cb74d925643d051cf9a08157563b8d0cba42e Mon Sep 17 00:00:00 2001 From: John Alling Date: Wed, 2 Oct 2024 16:27:24 -0400 Subject: [PATCH 5/9] update readme with niah updates --- src/leapfrogai_evals/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/src/leapfrogai_evals/README.md b/src/leapfrogai_evals/README.md index 857ad8958..8cd853878 100644 --- a/src/leapfrogai_evals/README.md +++ b/src/leapfrogai_evals/README.md @@ -108,6 +108,7 @@ The LeapfrogAI NIAH evaluation uses the following process: - prompt the LLM to provide the secret code hidden in the context - record the following: - whether or not the needle text was returned by the retrieval step of RAG + - which chunk from the retrieval step the needle was found in, if present - whether or not the needle text was returned by the LLM's final response - delete the contextual document from the vector store - delete the assistant From c057be23453261d3e898532a1930cb7bf70cdfc0 Mon Sep 17 00:00:00 2001 From: John Alling Date: Wed, 2 Oct 2024 17:03:46 -0400 Subject: [PATCH 6/9] add retrieval context to QA evals and new metrics --- src/leapfrogai_evals/evals/qa_eval.py | 12 +++++++-- src/leapfrogai_evals/runners/qa_runner.py | 30 +++++++++++++++++++++-- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/src/leapfrogai_evals/evals/qa_eval.py b/src/leapfrogai_evals/evals/qa_eval.py index 88cb60926..941217a85 100644 --- a/src/leapfrogai_evals/evals/qa_eval.py +++ b/src/leapfrogai_evals/evals/qa_eval.py @@ -2,7 +2,11 @@ import numpy as np import os -from deepeval.metrics import AnswerRelevancyMetric +from deepeval.metrics import ( + AnswerRelevancyMetric, + ContextualRelevancyMetric, + FaithfulnessMetric, +) from deepeval.test_case import LLMTestCase from leapfrogai_evals.metrics import AnnotationRelevancyMetric, CorrectnessMetric @@ -27,11 +31,11 @@ def qa_eval(*args, **kwargs) -> dict: actual_output=row["actual_output"], context=row["context"], expected_output=row["expected_output"], + retrieval_context=row["retrieval_context"], additional_metadata={ "actual_annotations": row["actual_annotations"], "expected_annotations": row["expected_annotations"], }, - # retrieval_context = row['retrieval_context'] # TODO: add this for more metrics ) ) @@ -45,10 +49,14 @@ def qa_eval(*args, **kwargs) -> dict: # TODO: Give ability to choose which metrics to run correctness_metric = CorrectnessMetric(model=judge_model) answer_relevancy_metric = AnswerRelevancyMetric(model=judge_model) + contextual_relevancy_metric = ContextualRelevancyMetric(model=judge_model) + faithfulness_metric = FaithfulnessMetric(model=judge_model) annotation_relevancy_metric = AnnotationRelevancyMetric() metrics = [ correctness_metric, answer_relevancy_metric, + contextual_relevancy_metric, + faithfulness_metric, annotation_relevancy_metric, ] diff --git a/src/leapfrogai_evals/runners/qa_runner.py b/src/leapfrogai_evals/runners/qa_runner.py index 4875e2ff8..f06dbb303 100644 --- a/src/leapfrogai_evals/runners/qa_runner.py +++ b/src/leapfrogai_evals/runners/qa_runner.py @@ -1,6 +1,8 @@ +import ast import logging import os import 
openai +import requests import shutil import zipfile @@ -52,6 +54,7 @@ def __init__( self.vector_store = None self.file_dict = None self.current_assistant = None + self.api_key = api_key or os.getenv("LEAPFROGAI_API_KEY") self.dataset_name = os.environ.get("QA_DATASET", dataset) self.model = os.environ.get("MODEL_TO_EVALUATE", model) self.temperature = float(os.environ.get("TEMPERATURE", temperature)) @@ -73,7 +76,7 @@ def __init__( self.client = openai.OpenAI( base_url=base_url or os.getenv("LEAPFROGAI_API_URL"), - api_key=api_key or os.getenv("LEAPFROGAI_API_KEY"), + api_key=self.api_key, ) logging.info(f"client url: {self.client.base_url}") try: # use existing vector store if supplied @@ -101,6 +104,7 @@ def run_experiment(self) -> None: try: response_contents = [] + retrieved_contexts = [] expected_annotations = [] actual_annotations = [] @@ -132,21 +136,40 @@ def run_experiment(self) -> None: response_messages.append(message) response_content = "" + retrieved_context = [] response_annotations = [] for response in response_messages: response_content += response.content[0].text.value + "\n" + chunk_ids = ast.literal_eval(response.metadata["vector_ids"]) + + # retrieve context used to generate response + for chunk_id in chunk_ids: + vector_response = requests.get( + url=os.getenv("LEAPFROGAI_API_LFAI_URL") + + "/vector_stores/vector/" + + chunk_id, + headers={ + "accept": "application/json", + "Authorization": "Bearer " + self.api_key, + }, + ).json() + retrieved_context.append(vector_response["content"]) for annotation in response.content[0].text.annotations: annotation_id = annotation.file_citation.file_id response_annotations.append(annotation_id) - logging.debug( + logging.info( f"number of annotations in response: {len(response.content[0].text.annotations)}" ) expected_annotations.append([self.file_dict[row["source_file"]]]) actual_annotations.append(response_annotations) + logging.info( + f"Retrieved context recorded: {vector_response['content']}" + ) + retrieved_contexts.append(retrieved_context) logging.info(f"Response recorded:\n{response_content}") response_contents.append(response_content) @@ -159,6 +182,9 @@ def run_experiment(self) -> None: self.qa_data = self.qa_data.add_column( name="actual_output", column=response_contents ) + self.qa_data = self.qa_data.add_column( + name="retrieval_context", column=retrieved_contexts + ) self.qa_data = self.qa_data.add_column( name="expected_annotations", column=expected_annotations ) From 4dee3a9449c2d9ad686ab3eccc8969a27425cc8f Mon Sep 17 00:00:00 2001 From: John Alling Date: Fri, 4 Oct 2024 11:20:16 -0400 Subject: [PATCH 7/9] refactor to not use multiple api env vars --- src/leapfrogai_evals/README.md | 2 +- src/leapfrogai_evals/main.py | 2 +- src/leapfrogai_evals/models/lfai.py | 2 +- src/leapfrogai_evals/runners/niah_runner.py | 6 +++--- src/leapfrogai_evals/runners/qa_runner.py | 6 +++--- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/leapfrogai_evals/README.md b/src/leapfrogai_evals/README.md index 8cd853878..a7938cc90 100644 --- a/src/leapfrogai_evals/README.md +++ b/src/leapfrogai_evals/README.md @@ -18,7 +18,7 @@ cp .env.example .env Within `.env`, replace the necessary environment variables: ```bash -LEAPFROGAI_API_URL= +LEAPFROGAI_API_URL= LEAPFROGAI_API_KEY= ANTHROPIC_API_KEY= ``` diff --git a/src/leapfrogai_evals/main.py b/src/leapfrogai_evals/main.py index 3813474f9..e57497f91 100644 --- a/src/leapfrogai_evals/main.py +++ b/src/leapfrogai_evals/main.py @@ -61,5 +61,5 @@ def run_evals(self, *args, 
**kwargs) -> None: logging.basicConfig(level=logging.INFO) load_dotenv() evaluator = RAGEvaluator() - evaluator.set_evaluations() + evaluator.set_evaluations(eval_list=["mmlu"]) evaluator.run_evals() diff --git a/src/leapfrogai_evals/models/lfai.py b/src/leapfrogai_evals/models/lfai.py index fca7c8de4..88cc65741 100644 --- a/src/leapfrogai_evals/models/lfai.py +++ b/src/leapfrogai_evals/models/lfai.py @@ -24,7 +24,7 @@ def __init__( ): self.model = model or os.getenv("MODEL_TO_EVALUATE") self.api_key = api_key or os.getenv("LEAPFROGAI_API_KEY") - self.base_url = base_url or os.getenv("LEAPFROGAI_API_URL") + self.base_url = base_url or os.getenv("LEAPFROGAI_API_URL") + "/openai/v1" self.client = openai.OpenAI(api_key=self.api_key, base_url=self.base_url) def load_model(self): diff --git a/src/leapfrogai_evals/runners/niah_runner.py b/src/leapfrogai_evals/runners/niah_runner.py index 99bfae069..4f711bdff 100644 --- a/src/leapfrogai_evals/runners/niah_runner.py +++ b/src/leapfrogai_evals/runners/niah_runner.py @@ -80,7 +80,7 @@ def __init__( ) self.client = openai.OpenAI( - base_url=base_url or os.environ.get("LEAPFROGAI_API_URL"), + base_url=base_url or os.environ.get("LEAPFROGAI_API_URL") + "/openai/v1", api_key=api_key or os.environ.get("LEAPFROGAI_API_KEY"), ) logging.info(f"client url: {self.client.base_url}") @@ -178,8 +178,8 @@ def run_experiment(self, cleanup: bool = True) -> None: for chunk_num, chunk_id in enumerate(chunk_ids): logging.info(f"chunk {chunk_num} (id: {chunk_id})") vector_response = requests.get( - url=os.getenv("LEAPFROGAI_API_LFAI_URL") - + "/vector_stores/vector/" + url=os.getenv("LEAPFROGAI_API_URL") + + "/leapfrogai/v1/vector_stores/vector/" + chunk_id, headers={ "accept": "application/json", diff --git a/src/leapfrogai_evals/runners/qa_runner.py b/src/leapfrogai_evals/runners/qa_runner.py index f06dbb303..cd297689e 100644 --- a/src/leapfrogai_evals/runners/qa_runner.py +++ b/src/leapfrogai_evals/runners/qa_runner.py @@ -75,7 +75,7 @@ def __init__( ) self.client = openai.OpenAI( - base_url=base_url or os.getenv("LEAPFROGAI_API_URL"), + base_url=base_url or os.getenv("LEAPFROGAI_API_URL") + "/openai/v1", api_key=self.api_key, ) logging.info(f"client url: {self.client.base_url}") @@ -145,8 +145,8 @@ def run_experiment(self) -> None: # retrieve context used to generate response for chunk_id in chunk_ids: vector_response = requests.get( - url=os.getenv("LEAPFROGAI_API_LFAI_URL") - + "/vector_stores/vector/" + url=os.getenv("LEAPFROGAI_API_URL") + + "/leapfrogai/v1/vector_stores/vector/" + chunk_id, headers={ "accept": "application/json", From bdbde55d71562d58392d3c484b5084a2ace78f35 Mon Sep 17 00:00:00 2001 From: John Alling Date: Fri, 4 Oct 2024 11:25:49 -0400 Subject: [PATCH 8/9] update .env.example --- src/leapfrogai_evals/.env.example | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/leapfrogai_evals/.env.example b/src/leapfrogai_evals/.env.example index 7ae533c01..9a9370e29 100644 --- a/src/leapfrogai_evals/.env.example +++ b/src/leapfrogai_evals/.env.example @@ -1,5 +1,4 @@ -LEAPFROGAI_API_URL="https://leapfrogai-api.uds.dev/openai/v1" -LEAPFROGAI_API_LFAI_URL=https://leapfrogai-api.uds.dev/leapfrogai/v1 +LEAPFROGAI_API_URL="https://leapfrogai-api.uds.dev" LEAPFROGAI_API_KEY="lfai-api-key" ANTHROPIC_API_KEY="anthropic-api-key" From 855cec51ec9c1561a93ab19c063921d344a1a40c Mon Sep 17 00:00:00 2001 From: John Alling Date: Fri, 4 Oct 2024 11:33:22 -0400 Subject: [PATCH 9/9] remove eval_list from main --- src/leapfrogai_evals/main.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/leapfrogai_evals/main.py b/src/leapfrogai_evals/main.py index e57497f91..3813474f9 100644 --- a/src/leapfrogai_evals/main.py +++ b/src/leapfrogai_evals/main.py @@ -61,5 +61,5 @@ def run_evals(self, *args, **kwargs) -> None: logging.basicConfig(level=logging.INFO) load_dotenv() evaluator = RAGEvaluator() - evaluator.set_evaluations(eval_list=["mmlu"]) + evaluator.set_evaluations() evaluator.run_evals()
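
Reviewer note (not part of the patch series): the sketch below pulls the two pieces of new scoring logic out of the diffs into one self-contained snippet — the chunk-rank formula that patch 3 adds to niah_runner.py and the vector-store chunk lookup that patches 3, 6, and 7 add to both runners. The endpoint path, header shape, and environment variable names are copied from the diffs above; the example values and function names here are hypothetical placeholders, and the HTTP lookup assumes a running LeapfrogAI API plus LEAPFROGAI_API_URL / LEAPFROGAI_API_KEY set as in patch 7 (base URL without the /openai/v1 suffix).

"""Illustrative sketch of the scoring logic added in this series; assumptions noted above."""

import os

import requests


def calculate_chunk_rank(chunk_place: int, total_chunks: int) -> float:
    """Mirror of _calculate_chunk_rank from patch 3: earlier chunks score higher.

    chunk_rank_score = (total_chunks - chunk_place) / total_chunks
    """
    return float(total_chunks - chunk_place) / float(total_chunks)


def get_chunk_content(chunk_id: str) -> str:
    """Fetch a chunk's text the way the runners do after patch 7.

    Requires a live API; LEAPFROGAI_API_URL and LEAPFROGAI_API_KEY must be set.
    """
    response = requests.get(
        url=os.getenv("LEAPFROGAI_API_URL")
        + "/leapfrogai/v1/vector_stores/vector/"
        + chunk_id,
        headers={
            "accept": "application/json",
            "Authorization": "Bearer " + os.getenv("LEAPFROGAI_API_KEY"),
        },
    )
    return response.json()["content"]


if __name__ == "__main__":
    # With 5 retrieved chunks, a needle found in the first chunk scores 1.0 and
    # in the last chunk 0.2; the runner scores 0 when the needle is never found.
    # get_chunk_content() is not exercised here because it needs a running API.
    for place in range(5):
        print(place, calculate_chunk_rank(chunk_place=place, total_chunks=5))

As described in the _calculate_chunk_rank docstring, the rank decays linearly with the chunk's position in the retrieval results, so the metric rewards retrieval that surfaces the needle early rather than merely somewhere in the context.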