From 3e5f1e0dd60fc451b3462c617c2a351e6a34467f Mon Sep 17 00:00:00 2001 From: John Alling <44934218+jalling97@users.noreply.github.com> Date: Tue, 17 Sep 2024 17:29:14 -0400 Subject: [PATCH] feat: add LLM as judge evaluations (#960) * add claude llm judge model * add correctness and annotation relevancy metrics * update root README and evals README * add QA eval runner * make eval runners more flexible with env vars --- README.md | 6 + src/leapfrogai_evals/.env.example | 28 ++ src/leapfrogai_evals/README.md | 50 ++- src/leapfrogai_evals/judges/__init__.py | 0 src/leapfrogai_evals/judges/claude_sonnet.py | 55 ++++ src/leapfrogai_evals/main.py | 138 ++++++-- .../metrics/annotation_relevancy.py | 65 ++++ src/leapfrogai_evals/metrics/correctness.py | 23 ++ src/leapfrogai_evals/pyproject.toml | 7 +- src/leapfrogai_evals/runners/niah_runner.py | 52 +-- src/leapfrogai_evals/runners/qa_runner.py | 304 ++++++++++++++++++ src/leapfrogai_evals/utils/__init__.py | 0 src/leapfrogai_evals/utils/defaults.py | 6 + 13 files changed, 689 insertions(+), 45 deletions(-) create mode 100644 src/leapfrogai_evals/.env.example create mode 100644 src/leapfrogai_evals/judges/__init__.py create mode 100644 src/leapfrogai_evals/judges/claude_sonnet.py create mode 100644 src/leapfrogai_evals/metrics/annotation_relevancy.py create mode 100644 src/leapfrogai_evals/metrics/correctness.py create mode 100644 src/leapfrogai_evals/runners/qa_runner.py create mode 100644 src/leapfrogai_evals/utils/__init__.py create mode 100644 src/leapfrogai_evals/utils/defaults.py diff --git a/README.md b/README.md index cdac6e8c2..7c09b075b 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ - [UI](#ui) - [Backends](#backends) - [Repeater](#repeater) + - [Evaluations](#evaluations) - [Usage](#usage) - [Local Development](#local-development) - [Contributing](#contributing) @@ -58,6 +59,7 @@ The LeapfrogAI repository follows a monorepo structure based around an [API](#ap leapfrogai/ ├── src/ │ ├── leapfrogai_api/ # source code for the API +│ ├── leapfrogai_evals/ # source code for the LeapfrogAI evaluation framework │ ├── leapfrogai_sdk/ # source code for the SDK │ └── leapfrogai_ui/ # source code for the UI ├── packages/ @@ -115,6 +117,10 @@ LeapfrogAI provides several backends for a variety of use cases. Below is the ba The [repeater](packages/repeater/) "model" is a basic "backend" that parrots all inputs it receives back to the user. It is built out the same way all the actual backends are and it is primarily used for testing the API. +### Evaluations + +LeapfrogAI comes with an evaluation framework that is integrated with [DeepEval](https://docs.confident-ai.com/). For more information on running and utilizing evaluations in LeapfrogAI, please see the [Evals README](/src/leapfrogai_evals/README.md). + ### Flavors Each component has different images and values that refer to a specific image registry and/or hardening source. 
These images are packaged using [Zarf Flavors](https://docs.zarf.dev/ref/examples/package-flavors/): diff --git a/src/leapfrogai_evals/.env.example b/src/leapfrogai_evals/.env.example new file mode 100644 index 000000000..cfc928bc3 --- /dev/null +++ b/src/leapfrogai_evals/.env.example @@ -0,0 +1,28 @@ +LEAPFROGAI_API_URL="https://leapfrogai-api.uds.dev/openai/v1" +LEAPFROGAI_API_KEY="lfai-api-key" +ANTHROPIC_API_KEY="anthropic-api-key" + +# ---- hyperparameters ---- +# general +MODEL_TO_EVALUATE=vllm +TEMPERATURE=0.1 +LLM_JUDGE=ClaudeSonnet + +# Needle in a Haystack +NIAH_DATASET=defenseunicorns/LFAI_RAG_niah_v1 +NIAH_ADD_PADDING=True +NIAH_MESSAGE_PROMPT="What is the secret code?" +NIAH_INSTRUCTION_TEMPLATE=DEFAULT_INSTRUCTION_TEMPLATE # this can be either a global variable name or a literal string +NIAH_MIN_DOC_LENGTH=4096 +NIAH_MAX_DOC_LENGTH=4096 +NIAH_MIN_DEPTH=0.0 +NIAH_MAX_DEPTH=1.0 +NIAH_NUM_COPIES=2 + +# Question & Answer +QA_DATASET=defenseunicorns/LFAI_RAG_qa_v1 +QA_INSTRUCTION_TEMPLATE=DEFAULT_INSTRUCTION_TEMPLATE # this can be either a global variable name or a literal string +QA_NUM_SAMPLES=25 +QA_NUM_DOCUMENTS=5 +#QA_VECTOR_STORE_ID= # set this to a vector store id if you want to use an already existing vector store with the files present +QA_CLEANUP_VECTOR_STORE=True # recommend setting this to False if a vector store id is provided diff --git a/src/leapfrogai_evals/README.md b/src/leapfrogai_evals/README.md index 53314f1bd..fb492113c 100644 --- a/src/leapfrogai_evals/README.md +++ b/src/leapfrogai_evals/README.md @@ -7,13 +7,20 @@ The LeapfrogAI RAG evaluation system assumes the following: - LeapfrogAI is deployed - A valid LeapfrogAI API key is set (for more info, see the [API README](/src/leapfrogai_api/README.md)) +- For all LLM-enabled metrics, a valid Anthropic API key is set -Set the following environment variables: +For the easiest setup, copy the `.env.example` file: + +```bash +cp .env.example .env +``` + +Within `.env`, replace the necessary environment variables: ```bash LEAPFROGAI_API_URL= LEAPFROGAI_API_KEY= -MODEL_TO_EVALUATE="vllm" # can also be provided as "model" to the __init__ for the runner +ANTHROPIC_API_KEY= ``` Running `main.py` will by default run all of the evaluations currently available: @@ -24,6 +31,45 @@ python -m pip install . python main.py ``` +## Question/Answer Evaluation + +Question and answer pairs are a valuable setup for evaluating LLM systems as a whole. Within LeapfrogAI, this type of evaluation takes an input question, expected context, and expected output, and compares them to the retrieved context from RAG and the system's final output. + +### Data +The LeapfrogAI QA evaluation uses a custom dataset available on HuggingFace: [defenseunicorns/LFAI_RAG_qa_v1](https://huggingface.co/datasets/defenseunicorns/LFAI_RAG_qa_v1) + +LFAI_RAG_qa_v1 contains 36 question/answer/context entries that are intended to be used for LLM-as-a-judge enabled RAG Evaluations.
+ +Example: + +```json +{ + "input": "What requirement must be met to run VPI PVA algorithms in a Docker container?", + "actual_output": null, + "expected_output": "To run VPI PVA algorithms in a Docker container, the same VPI version must be installed on the Docker host.", + "context": [ + "2.6.\nCompute\nStack\nThe\nfollowing\nDeep\nLearning-related\nissues\nare\nnoted\nin\nthis\nrelease.\nIssue\nDescription\n4564075\nTo\nrun\nVPI\nPVA\nalgorithms\nin\na\ndocker\ncontainer,\nthe\nsame\nVPI\nversion\nhas\nto\nbe\ninstalled\non \nthe\ndocker\nhost.\n2.7.\nDeepstream\nIssue\nDescription\n4325898\nThe\npipeline\ngets\nstuck\nfor\nmulti\u0000lesrc\nwhen\nusing\nnvv4l2decoder.\nDS\ndevelopers\nuse \nthe\npipeline\nto\nrun\ndecode\nand\ninfer\njpeg\nimages.\nNVIDIA\nJetson\nLinux\nRelease\nNotes\nRN_10698-r36.3\n|\n11" + ], + "source_file": "documents/Jetson_Linux_Release_Notes_r36.3.pdf" +} +``` + +### Experimental Design +The LeapfrogAI QA evaluation uses the following process: + +- build a vector store and upload the contextual documents from the qa dataset +- for each row in the dataset: + - create an assistant + - prompt the LLM to answer the input question using the contextual documents + - record the following: + - the model response + - the retrieved context from RAG + - delete the assistant +- delete the contextual documents +- delete the vector store + +Various metrics can then be calculated using these individual pieces. + ## Needle in a Haystack (NIAH) A Needle in a Haystack evaluation is used to evaluate the performance of the LeapfrogAI RAG system in tasks that require finding a specific piece of information (the "needle") within a large body of text (the "haystack"). diff --git a/src/leapfrogai_evals/judges/__init__.py b/src/leapfrogai_evals/judges/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/leapfrogai_evals/judges/claude_sonnet.py b/src/leapfrogai_evals/judges/claude_sonnet.py new file mode 100644 index 000000000..d7ad0b5ab --- /dev/null +++ b/src/leapfrogai_evals/judges/claude_sonnet.py @@ -0,0 +1,55 @@ +import os + +import instructor +from pydantic import BaseModel +from deepeval.models.base_model import DeepEvalBaseLLM +import asyncio +from anthropic import Anthropic +from typing import Optional + + +class ClaudeSonnet(DeepEvalBaseLLM): + """A DeepEval LLM class that uses the Anthropic API to run Claude models as an LLM judge""" + + def __init__( + self, api_key: Optional[str] = None, model: str = "claude-3-5-sonnet-20240620" + ): + self.model = model + self.client = Anthropic(api_key=api_key or os.environ.get("ANTHROPIC_API_KEY")) + + def load_model(self): + """Returns the current model selected""" + return self.model + + def generate( + self, + prompt: str, + schema: BaseModel, + max_tokens: int = 1024, + ) -> BaseModel: + """Generates a response from the Anthropic API""" + instructor_client = instructor.from_anthropic(self.client) + response = instructor_client.messages.create( + model=self.model, + max_tokens=max_tokens, + messages=[ + { + "role": "user", + "content": prompt, + } + ], + response_model=schema, + ) + return response + + async def a_generate( + self, prompt: str, schema: BaseModel, *args, **kwargs + ) -> BaseModel: + """Async implementation of the generate function""" + loop = asyncio.get_running_loop() + return await loop.run_in_executor( + None, lambda: self.generate(prompt, schema, *args, **kwargs) + ) + + def get_model_name(self): + return f"Anthropic {self.model}" diff --git a/src/leapfrogai_evals/main.py b/src/leapfrogai_evals/main.py
index 1ca7772bc..a32daaa66 100644 --- a/src/leapfrogai_evals/main.py +++ b/src/leapfrogai_evals/main.py @@ -1,40 +1,73 @@ -import deepeval from deepeval.test_case import LLMTestCase +from deepeval.metrics import AnswerRelevancyMetric + import logging +import numpy as np +import os +from dotenv import load_dotenv +import time +from typing import Optional, List -from leapfrogai_evals.runners.niah_runner import NIAH_Runner +from leapfrogai_evals.judges.claude_sonnet import ClaudeSonnet # noqa +from leapfrogai_evals.metrics.annotation_relevancy import AnnotationRelevancyMetric +from leapfrogai_evals.metrics.correctness import CorrectnessMetric from leapfrogai_evals.metrics.niah_metrics import NIAH_Retrieval, NIAH_Response +from leapfrogai_evals.runners.niah_runner import NIAH_Runner +from leapfrogai_evals.runners.qa_runner import QA_Runner -ALL_EVALS = ["LFAI_NIAH"] +ALL_EVALS = ["niah_eval", "qa_eval"] class RAGEvaluator: """A class that handles running all of the LeapfrogAI RAG evaluations""" - def __init__(self): - self.eval_list = None + def __init__( + self, + eval_list: Optional[List[str]] = None, + ): + self.eval_list = eval_list self.test_case_dict = None self.niah_test_cases = None - self.eval_options = ALL_EVALS + self.eval_results = dict() - def set_evaluations(self, evals_list=[]) -> None: + def set_evaluations(self, eval_list: List[str] = None) -> None: """Set the evaluations that will be run via a list""" - if len(evals_list) == 0: + if not eval_list: logging.info("Setting eval list to ALL") self.eval_list = ALL_EVALS - # TODO: Add other evals options + else: + for item in eval_list: + if item not in ALL_EVALS: + raise AttributeError( + f"'{item}' is not an available evaluation. Please limit the list to one of the following: {ALL_EVALS}" + ) + self.eval_list = eval_list def run_evals(self, *args, **kwargs) -> None: """Run all of the selected evaluations""" + if self.eval_list is None: + raise AttributeError( + "the list of evaluations has not been set. 
Please do so by running the 'set_evaluations()' function" + ) + logging.info("Running the following evaluations:") - for eval in self.eval_list: - logging.info(f" -{eval}") - if "LFAI_NIAH" in self.eval_list: - self._niah_evaluation(*args, **kwargs) - # TODO: add more evaluations + logging.info("".join([f"\n - {eval_name}" for eval_name in self.eval_list])) - def _niah_evaluation(self, *args, **kwargs) -> None: + start_time = time.time() + for eval_name in self.eval_list: + eval = getattr(self, eval_name) + eval(*args, **kwargs) + end_time = time.time() + + self.eval_results["Eval Execution Runtime (seconds)"] = end_time - start_time + + logging.info("\n\nFinal Results:") + for key, value in self.eval_results.items(): + logging.info(f"{key}: {value}") + + def niah_eval(self, *args, **kwargs) -> None: """Run the Needle in a Haystack evaluation""" + logging.info("Beginning Needle in a Haystack Evaluation...") self.niah_test_cases = [] niah_runner = NIAH_Runner(*args, **kwargs) @@ -55,16 +88,85 @@ def _niah_evaluation(self, *args, **kwargs) -> None: ) # run metrics + # TODO: Give ability to choose which metrics to run retrieval_metric = NIAH_Retrieval() response_metric = NIAH_Response() + metrics = [retrieval_metric, response_metric] + + for metric in metrics: + scores = [] + successes = [] + for test_case in self.niah_test_cases: + metric.measure(test_case) + scores.append(metric.score) + successes.append(metric.is_successful()) + self.eval_results[f"Average {metric.__name__}"] = np.mean(scores) + logging.info(f"{metric.__name__} Results:") + logging.info(f"average score: {np.mean(scores)}") + logging.info(f"scores: {scores}") + logging.info(f"successes: {successes}") + + def qa_eval(self, *args, **kwargs) -> None: + """Runs the Question/Answer evaluation""" + logging.info("Beginning Question/Answer Evaluation...") + self.qa_test_cases = [] + + qa_runner = QA_Runner(*args, **kwargs) + qa_runner.run_experiment() + + # build test cases out of the qa_dataset + for row in qa_runner.qa_data: + self.qa_test_cases.append( + LLMTestCase( + input=row["input"], + actual_output=row["actual_output"], + context=row["context"], + expected_output=row["expected_output"], + additional_metadata={ + "actual_annotations": row["actual_annotations"], + "expected_annotations": row["expected_annotations"], + }, + # retrieval_context = row['retrieval_context'] # TODO: add this for more metrics + ) + ) + + # Create judge llm + try: + judge_model = globals()[os.environ.get("LLM_JUDGE")]() + except KeyError: + judge_model = os.environ.get("LLM_JUDGE") + + # run metrics + # TODO: Give ability to choose which metrics to run + correctness_metric = CorrectnessMetric(model=judge_model) + answer_relevancy_metric = AnswerRelevancyMetric(model=judge_model) + annotation_relevancy_metric = AnnotationRelevancyMetric() + metrics = [ + correctness_metric, + answer_relevancy_metric, + annotation_relevancy_metric, + ] - deepeval.evaluate( - test_cases=self.niah_test_cases, metrics=[retrieval_metric, response_metric] - ) + for metric in metrics: + scores = [] + successes = [] + reasons = [] + for test_case in self.qa_test_cases: + metric.measure(test_case) + scores.append(metric.score) + successes.append(metric.is_successful()) + reasons.append(metric.reason) + self.eval_results[f"Average {metric.__name__}"] = np.mean(scores) + logging.info(f"{metric.__name__} Results:") + logging.info(f"average score: {np.mean(scores)}") + logging.info(f"scores: {scores}") + logging.info(f"successes: {successes}") + logging.info(f"reasons: 
{reasons}") if __name__ == "__main__": logging.basicConfig(level=logging.INFO) + load_dotenv() evaluator = RAGEvaluator() evaluator.set_evaluations() evaluator.run_evals() diff --git a/src/leapfrogai_evals/metrics/annotation_relevancy.py b/src/leapfrogai_evals/metrics/annotation_relevancy.py new file mode 100644 index 000000000..c35e828a5 --- /dev/null +++ b/src/leapfrogai_evals/metrics/annotation_relevancy.py @@ -0,0 +1,65 @@ +from deepeval.metrics import BaseMetric +from deepeval.test_case import LLMTestCase +import asyncio + + +class AnnotationRelevancyMetric(BaseMetric): + """A heuristic (non-LLM) metric for measuring how relevant the annotated documents are to the needed context""" + + def __init__( + self, + threshold: float = 0.75, + async_mode: bool = True, + ): + self.threshold = threshold + self.async_mode = async_mode + + def measure(self, test_case: LLMTestCase) -> int: + """ + Calculates the number of relevant annotations out of the total annotations + + This function calculates a simple fraction of the number of relevant annotations (usually 1) + divided by the total number of annotations (number of documents referenced) returned by RAG. + An annotation is considered relevant if it is in the listed of provided annotations and is expected + + score = # of relevant annotations / # of total annotations + + params: + ------- + test_case: LLMTestCase + A test case object built from the results of a question/answer evaluation run. + test_case should contain an additional metadata field that returns a dictionary with + the fields "expected_annotations" and "actual_annotations" which both contain lists of strings (file ids) + + returns: + ------- + float + A score from 0-1 that represents the fraction of relevant annotations out of all annotations + """ + + relevant_annotations = 0 + total_annotations = len(test_case.additional_metadata["actual_annotations"]) + for annotation in test_case.additional_metadata["actual_annotations"]: + if annotation in test_case.additional_metadata["expected_annotations"]: + relevant_annotations += 1 + + self.score = float(relevant_annotations / total_annotations) + self.success = self.score >= self.threshold + + if self.success: + self.reason = f"The fraction of relevant annotations out of the total number of annotations ({self.score}) is greater than or equal to the threshold of {self.threshold}" + else: + self.reason = f"The fraction of relevant annotations out of the total number of annotations ({self.score}) is less than the threshold of {self.threshold}" + + return self.score + + async def a_measure(self, test_case: LLMTestCase) -> int: + loop = asyncio.get_running_loop() + return await loop.run_in_executor(None, self.measure, test_case) + + def is_successful(self) -> bool: + return self.success + + @property + def __name__(self): + return "Annotation Relevancy" diff --git a/src/leapfrogai_evals/metrics/correctness.py b/src/leapfrogai_evals/metrics/correctness.py new file mode 100644 index 000000000..c7ef54e82 --- /dev/null +++ b/src/leapfrogai_evals/metrics/correctness.py @@ -0,0 +1,23 @@ +from deepeval.metrics import GEval +from deepeval.test_case import LLMTestCaseParams + + +class CorrectnessMetric(GEval): + """A custom metric that determines if the actual output matches an expected output""" + + def __init__(self, *args, **kwargs): + super().__init__( + name="Correctness", + criteria="Determine whether the actual output is factually correct based on the expected output.", + evaluation_steps=[ + "Check whether the facts in 'actual output' 
contradicts any facts in 'expected output'", + "You should also heavily penalize omission of detail", + "Vague language, or contradicting OPINIONS, are OK", + ], + evaluation_params=[ + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, + ], + *args, + **kwargs, + ) diff --git a/src/leapfrogai_evals/pyproject.toml b/src/leapfrogai_evals/pyproject.toml index 07e2129ed..0590c0fc3 100644 --- a/src/leapfrogai_evals/pyproject.toml +++ b/src/leapfrogai_evals/pyproject.toml @@ -7,13 +7,16 @@ version = "0.12.2" # x-release-please-end dependencies = [ - "deepeval == 1.1.1", + "deepeval == 1.1.6", "openai == 1.42.0", "tqdm == 4.66.5", "python-dotenv == 1.0.1", "seaborn == 0.13.2", "datasets == 2.21.0", - "huggingface-hub == 0.24.6" + "huggingface-hub == 0.24.6", + "anthropic == 0.34.1", + "instructor == 1.4.0", + "pyPDF2 == 3.0.1" ] requires-python = "~=3.11" readme = "README.md" diff --git a/src/leapfrogai_evals/runners/niah_runner.py b/src/leapfrogai_evals/runners/niah_runner.py index e36b46ccc..e200a604a 100644 --- a/src/leapfrogai_evals/runners/niah_runner.py +++ b/src/leapfrogai_evals/runners/niah_runner.py @@ -4,20 +4,13 @@ import openai from datasets import load_dataset, concatenate_datasets -from dotenv import load_dotenv +from distutils.util import strtobool from tqdm import tqdm from openai.types.beta.assistant import Assistant from openai.types.beta.vector_store import VectorStore -load_dotenv() - -DEFAULT_INSTRUCTION_TEMPLATE = """ - You are a helpful AI bot that answers questions for a user. Keep your response short and direct. - You will receive a set of context and a question that will relate to the context. - Do not give information outside the document or repeat your findings. - If the information is not available in the context respond UNANSWERABLE. 
- """ +from utils.defaults import DEFAULT_INSTRUCTION_TEMPLATE class NIAH_Runner: @@ -50,14 +43,14 @@ def __init__( add_padding: bool = True, base_url: str = None, api_key: str = None, - model: str = None, - message_prompt: str = "What is Doug's secret code?", + model: str = "vllm", + message_prompt: str = "What is the secret code?", instruction_template: str = DEFAULT_INSTRUCTION_TEMPLATE, min_doc_length: int = 4096, max_doc_length: int = 4096, min_depth: float = 0.0, max_depth: float = 1.0, - num_copies: int = 2, + num_copies: int = 3, ): """Initialize the Assistant with an API key and the path to the text file""" @@ -66,23 +59,36 @@ def __init__( self.vector_store = None self.current_file = None self.current_assistant = None - self.message_prompt = message_prompt - self.instruction_template = instruction_template - self.model = model or os.environ.get("MODEL_TO_EVALUATE") - self.temperature = temperature - self.add_padding = add_padding + self.message_prompt = os.environ.get("NIAH_MESSAGE_PROMPT", message_prompt) + self.model = os.environ.get("MODEL_TO_EVALUATE", model) + self.temperature = float(os.environ.get("TEMPERATURE", temperature)) + self.add_padding = ( + bool(strtobool(os.environ.get("NIAH_ADD_PADDING"))) + if os.environ.get("NIAH_ADD_PADDING") is not None + else add_padding + ) + try: + self.instruction_template = globals()[ + os.environ.get("NIAH_INSTRUCTION_TEMPLATE") + ] + except KeyError: + logging.debug("Instruction template not in globals; setting as a string") + self.instruction_template = os.environ.get( + "NIAH_INSTRUCTION_TEMPLATE", instruction_template + ) + self.client = openai.OpenAI( base_url=base_url or os.environ.get("LEAPFROGAI_API_URL"), api_key=api_key or os.environ.get("LEAPFROGAI_API_KEY"), ) logging.info(f"client url: {self.client.base_url}") self._load_niah_dataset( - dataset, - min_doc_length=min_doc_length, - max_doc_length=max_doc_length, - min_depth=min_depth, - max_depth=max_depth, - num_copies=num_copies, + dataset_name=os.environ.get("NIAH_DATASET", dataset), + min_doc_length=int(os.environ.get("NIAH_MIN_DOC_LENGTH", min_doc_length)), + max_doc_length=int(os.environ.get("NIAH_MAX_DOC_LENGTH", max_doc_length)), + min_depth=float(os.environ.get("NIAH_MIN_DEPTH", min_depth)), + max_depth=float(os.environ.get("NIAH_MAX_DEPTH", max_depth)), + num_copies=int(os.environ.get("NIAH_NUM_COPIES", num_copies)), ) self._create_vector_store() self.retrieval_score = None diff --git a/src/leapfrogai_evals/runners/qa_runner.py b/src/leapfrogai_evals/runners/qa_runner.py new file mode 100644 index 000000000..4875e2ff8 --- /dev/null +++ b/src/leapfrogai_evals/runners/qa_runner.py @@ -0,0 +1,304 @@ +import logging +import os +import openai +import shutil +import zipfile + +from datasets import load_dataset +from distutils.util import strtobool +from huggingface_hub import hf_hub_download +from tqdm import tqdm + +from openai.types.beta.assistant import Assistant +from openai.types.beta.vector_store import VectorStore + +from utils.defaults import DEFAULT_INSTRUCTION_TEMPLATE + + +class QA_Runner: + """ + A runner to handle executing Question and Answer (QA) evals for LeapfrogAI + + This runner assumes LeapfrogAI is already deployed + + The evaluation takes the following steps (by default) + - Creates a vector store + - Uploads the contextual documents needed to answer the questions in the dataset + - For each question in the dataset: + - create an assistant + - prompt the system to answer the question + - record the response + - delete the assistant + - delete 
documents + - delete the vector store + """ + + def __init__( + self, + dataset: str = "defenseunicorns/LFAI_RAG_qa_v1", + model: str = "vllm", + temperature: float = 0.1, + base_url: str = None, + api_key: str = None, + num_samples: int = 32, + num_documents: int = 5, + instruction_template: str = DEFAULT_INSTRUCTION_TEMPLATE, + vector_store_id: str = None, + cleanup: bool = True, + ): + """Initialize the Assistant with an API key and the path to the text file""" + + self.qa_data = None + self.vector_store = None + self.file_dict = None + self.current_assistant = None + self.dataset_name = os.environ.get("QA_DATASET", dataset) + self.model = os.environ.get("MODEL_TO_EVALUATE", model) + self.temperature = float(os.environ.get("TEMPERATURE", temperature)) + self.num_documents = int(os.environ.get("QA_NUM_DOCUMENTS", num_documents)) + self.cleanup_after = ( + bool(strtobool(os.environ.get("QA_CLEANUP_VECTOR_STORE"))) + if os.environ.get("QA_CLEANUP_VECTOR_STORE") is not None + else cleanup + ) + try: + self.instruction_template = globals()[ + os.environ.get("QA_INSTRUCTION_TEMPLATE") + ] + except KeyError: + logging.debug("Instruction template not in globals; setting as a string") + self.instruction_template = os.environ.get( + "QA_INSTRUCTION_TEMPLATE", instruction_template + ) + + self.client = openai.OpenAI( + base_url=base_url or os.getenv("LEAPFROGAI_API_URL"), + api_key=api_key or os.getenv("LEAPFROGAI_API_KEY"), + ) + logging.info(f"client url: {self.client.base_url}") + try: # use existing vector store if supplied + self.vector_store = self._get_vector_store( + os.environ.get("QA_VECTOR_STORE_ID", vector_store_id) + ) + except Exception: # otherwise create a new one + self.vector_store = self._create_vector_store() + if not os.environ.get("QA_VECTOR_STORE_ID") and not vector_store_id: + self._upload_context( + dataset_name=self.dataset_name, num_documents=self.num_documents + ) + self._load_qa_dataset( + dataset_name=self.dataset_name, + num_samples=int(os.environ.get("QA_NUM_SAMPLES", num_samples)), + ) + + def run_experiment(self) -> None: + """Prompts LFAI to answer questions from the QA dataset""" + if self.cleanup_after: + logging.info( + "By default, all files and the vector store will be deleted after running the experiment. \ + Please set `self.cleanup_after` to false if this is not preferred." 
+ ) + + try: + response_contents = [] + expected_annotations = [] + actual_annotations = [] + + for row in tqdm(self.qa_data, desc="Evaluating data rows"): + # create assistant + self.current_assistant = self._create_assistant() + + # create thread + thread = self.client.beta.threads.create() + self.client.beta.threads.messages.create( + thread_id=thread.id, + role="user", + content=row["input"], + ) + + # create run + run = self.client.beta.threads.runs.create_and_poll( + assistant_id=self.current_assistant.id, thread_id=thread.id + ) + + # get messages + messages = self.client.beta.threads.messages.list( + thread_id=thread.id, run_id=run.id + ).data + + response_messages = [] + for message in messages: + if message.role == "assistant": + response_messages.append(message) + + response_content = "" + response_annotations = [] + for response in response_messages: + response_content += response.content[0].text.value + "\n" + + for annotation in response.content[0].text.annotations: + annotation_id = annotation.file_citation.file_id + response_annotations.append(annotation_id) + + logging.debug( + f"number of annotations in response: {len(response.content[0].text.annotations)}" + ) + + expected_annotations.append([self.file_dict[row["source_file"]]]) + actual_annotations.append(response_annotations) + + logging.info(f"Response recorded:\n{response_content}") + response_contents.append(response_content) + + # delete the assistant + self._delete_assistant(self.current_assistant.id) + self.current_assistant = None + + # set the responses + self.qa_data = self.qa_data.remove_columns("actual_output") + self.qa_data = self.qa_data.add_column( + name="actual_output", column=response_contents + ) + self.qa_data = self.qa_data.add_column( + name="expected_annotations", column=expected_annotations + ) + self.qa_data = self.qa_data.add_column( + name="actual_annotations", column=actual_annotations + ) + + if self.cleanup_after: + self.cleanup() + + # remove artifacts from the API if the experiment fails + except Exception as exc: + logging.info("Error encountered, running cleanup") + self.cleanup() + raise exc + + def cleanup(self) -> None: + """ + Deletes the vector store and any remaining uploaded files + + This is run by default after completing a run and in case a run fails + """ + logging.info("Cleaning up runtime artifacts...") + if self.current_assistant: + self._delete_assistant(assistant_id=self.current_assistant.id) + self.current_assistant = None + if self.file_dict: + self._delete_context() + self.file_dict = None + if self.vector_store: + self._delete_vector_store(vector_store_id=self.vector_store.id) + self.vector_store = None + + def _load_qa_dataset(self, dataset_name: str, num_samples: int): + """ + Load the Defense Unicorns LFAI QA dataset with the requested constraints + + By default, the dataset will contain 32 elements + """ + logging.info(f"Downloading dataset: {dataset_name} from HuggingFace") + qa_dataset = load_dataset(dataset_name)["eval"] + qa_dataset = qa_dataset.select( + ( + i + for i in range(len(qa_dataset)) + if (qa_dataset[i]["source_file"] in self.doc_list) + ) + ) + + logging.info(f"Dataset downloaded: \n{qa_dataset}") + if num_samples < len(qa_dataset): + qa_dataset = qa_dataset.select(range(num_samples)) + + self.qa_data = qa_dataset + + def _create_assistant(self) -> Assistant: + """Create an assistant for running the QA evaluation""" + logging.info("Creating new assistant...") + assistant = self.client.beta.assistants.create( + name="LFAI QA Assistant", + 
instructions=self.instruction_template, + model=self.model, + temperature=self.temperature, + tools=[{"type": "file_search"}], + tool_resources={ + "file_search": {"vector_store_ids": [self.vector_store.id]} + }, + ) + return assistant + + def _delete_assistant(self, assistant_id: str) -> None: + """Deletes the current assistant""" + logging.info("deleting assistant...") + self.client.beta.assistants.delete(assistant_id=assistant_id) + pass + + def _create_vector_store(self) -> VectorStore: + logging.info("Creating vector store...") + vector_store = self.client.beta.vector_stores.create( + name="Question/Answer Store", + file_ids=[], + expires_after={"anchor": "last_active_at", "days": 1}, + metadata={"project": "QA Evaluation", "version": "0.1"}, + ) + return vector_store + + def _get_vector_store(self, vector_store_id: str) -> VectorStore: + logging.info("Retrieving vector store...") + vector_store = self.client.beta.vector_stores.retrieve( + vector_store_id=vector_store_id + ) + return vector_store + + def _delete_vector_store(self, vector_store_id: str) -> None: + """Deletes the vector store used for all QA evaluations""" + logging.info("Deleting vector store...") + _ = self.client.beta.vector_stores.delete(vector_store_id=vector_store_id) + self.vector_store = None + + def _upload_context( + self, dataset_name: str, num_documents: int | None = None + ) -> None: + """Uploads the full-text context documents to the vector store""" + self.file_dict = dict() + zip_path = hf_hub_download( + repo_id=dataset_name, repo_type="dataset", filename="documents_partial.zip" + ) + # make a temporary directory to store documents + with zipfile.ZipFile(zip_path, "r") as zip_ref: + zip_ref.extractall(".") + doc_list = zip_ref.namelist() + context_dir = doc_list.pop(0) # first entry is the parent dir + + if num_documents: + try: + doc_list = doc_list[0:num_documents] + except Exception: + logging.info( + f"The number of documents requested was invalid ({num_documents}), defaulting to all documents ({len(doc_list)})" + ) + + logging.info(f"doc list: {doc_list}") + + logging.info("Uploading context documents") + for doc in tqdm(doc_list): + with open(doc, "rb") as pdf_file: + vector_store_file = self.client.beta.vector_stores.files.upload( + vector_store_id=self.vector_store.id, file=pdf_file + ) + self.file_dict[doc] = vector_store_file.id + + shutil.rmtree(context_dir) + logging.debug( + f"data in vector store: {self.client.beta.vector_stores.files.list(vector_store_id=self.vector_store.id).data}" + ) + + self.doc_list = doc_list + + def _delete_context(self) -> None: + """Deletes the context files uploaded to the vector store""" + logging.info("Deleting uploaded context files...") + for _, file_id in self.file_dict.items(): + self.client.files.delete(file_id=file_id) diff --git a/src/leapfrogai_evals/utils/__init__.py b/src/leapfrogai_evals/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/leapfrogai_evals/utils/defaults.py b/src/leapfrogai_evals/utils/defaults.py new file mode 100644 index 000000000..1680be2c0 --- /dev/null +++ b/src/leapfrogai_evals/utils/defaults.py @@ -0,0 +1,6 @@ +DEFAULT_INSTRUCTION_TEMPLATE = """ + You are a helpful AI bot that answers questions for a user. Keep your response short and direct. + You will receive a set of context and a question that will relate to the context. + Do not give information outside the document or repeat your findings. + If the information is not available in the context respond UNANSWERABLE. + """
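Below is a minimal usage sketch (not part of the patch itself) showing how the `RAGEvaluator` entrypoint introduced in `main.py` can be driven programmatically, for example to run only the QA evaluation. It assumes a deployed LeapfrogAI instance, a populated `.env` as described in `.env.example`, and that `main.py` is importable as `leapfrogai_evals.main` (the import path is an assumption).

```python
# Usage sketch for the RAGEvaluator added in this patch.
# Assumes LEAPFROGAI_API_URL, LEAPFROGAI_API_KEY, ANTHROPIC_API_KEY, etc. are set in .env;
# the import path below is an assumption about how the package is installed.
import logging

from dotenv import load_dotenv

from leapfrogai_evals.main import RAGEvaluator

logging.basicConfig(level=logging.INFO)
load_dotenv()  # pull API keys and hyperparameters from .env

evaluator = RAGEvaluator()
evaluator.set_evaluations(["qa_eval"])  # limit the run to the QA eval; call with no args to run all evals
evaluator.run_evals()
```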