-
Notifications
You must be signed in to change notification settings - Fork 28
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add LLM as judge evaluations (#960)
* add claude llm judge model * add correctness and annotation relevancy metrics * update root README and evals README * add QA eval runner * make eval runners more flexible with env vars
- Loading branch information
Showing
13 changed files
with
689 additions
and
45 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
LEAPFROGAI_API_URL="https://leapfrogai-api.uds.dev/openai/v1" | ||
LEAPFROGAI_API_KEY="lfai-api-key" | ||
ANTHROPIC_API_KEY="anthropic-api-key" | ||
|
||
# ---- hyperparameters ---- | ||
# general | ||
MODEL_TO_EVALUATE=vllm | ||
TEMPERATURE=0.1 | ||
LLM_JUDGE=ClaudeSonnet | ||
|
||
# Needle in a Haystack | ||
NIAH_DATASET=defenseunicorns/LFAI_RAG_niah_v1 | ||
NIAH_ADD_PADDING=True | ||
NIAH_MESSAGE_PROMPT="What is the secret code?" | ||
NIAH_INSTRUCTION_TEMPLATE=DEFAULT_INSTRUCTION_TEMPLATE # this can be either a global or a string | ||
NIAH_MIN_DOC_LENGTH=4096 | ||
NIAH_MAX_DOC_LENGTH=4096 | ||
NIAH_MIN_DEPTH=0.0 | ||
NIAH_MAX_DEPTH=1.0 | ||
NIAH_NUM_COPIES=2 | ||
|
||
# Question & Answering | ||
QA_DATASET=defenseunicorns/LFAI_RAG_qa_v1 | ||
QA_INSTRUCTION_TEMPLATE=DEFAULT_INSTRUCTION_TEMPLATE # this can be either a global or a string | ||
QA_NUM_SAMPLES=25 | ||
QA_NUM_DOCUMENTS=5 | ||
#QA_VECTOR_STORE_ID= # set this to a vector store id if you want to reuse an existing vector store that already contains the files | ||
QA_CLEANUP_VECTOR_STORE=True # recommend setting this to False if a vector store id is provided |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
import os | ||
|
||
import instructor | ||
from pydantic import BaseModel | ||
from deepeval.models.base_model import DeepEvalBaseLLM | ||
import asyncio | ||
from anthropic import Anthropic | ||
from typing import Optional | ||
|
||
|
||
class ClaudeSonnet(DeepEvalBaseLLM):
    """DeepEval LLM wrapper that sends prompts to Claude via the Anthropic API.

    Requests go through an `instructor`-wrapped Anthropic client, with the
    caller-supplied schema passed as the `response_model`.
    """

    def __init__(
        self, api_key: Optional[str] = None, model: str = "claude-3-5-sonnet-20240620"
    ):
        """Store the model name and create the Anthropic client.

        Args:
            api_key: Anthropic API key. When falsy, the value of the
                ANTHROPIC_API_KEY environment variable is used instead.
            model: Name of the Anthropic model to request.
        """
        self.model = model
        self.client = Anthropic(api_key=api_key or os.environ.get("ANTHROPIC_API_KEY"))

    def load_model(self):
        """Return the name of the currently selected model."""
        return self.model

    def generate(
        self,
        prompt: str,
        schema: BaseModel,
        max_tokens: int = 1024,
    ) -> BaseModel:
        """Send a single user prompt to the Anthropic API and return the result.

        Args:
            prompt: Text sent as the content of one "user" message.
            schema: Pydantic model handed to instructor as `response_model`.
            max_tokens: Token limit forwarded to the messages API.

        Returns:
            Whatever the instructor client returns for `response_model=schema`.
        """
        instructor_client = instructor.from_anthropic(self.client)
        response = instructor_client.messages.create(
            model=self.model,
            max_tokens=max_tokens,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            response_model=schema,
        )
        return response

    async def a_generate(
        self, prompt: str, schema: BaseModel, *args, **kwargs
    ) -> BaseModel:
        """Run `generate` in the default executor so the event loop is not blocked.

        Args:
            prompt: Text forwarded to `generate`.
            schema: Pydantic model forwarded to `generate`.
            *args: Extra positional arguments forwarded to `generate`.
            **kwargs: Extra keyword arguments forwarded to `generate`
                (for example `max_tokens`).

        Returns:
            The value returned by `generate`.
        """
        # Imported locally so the fix does not depend on the module's import block.
        import functools

        loop = asyncio.get_running_loop()
        # run_in_executor() forwards positional arguments only; passing
        # **kwargs to it directly raises TypeError. Bind everything up front.
        bound_generate = functools.partial(
            self.generate, prompt, schema, *args, **kwargs
        )
        return await loop.run_in_executor(None, bound_generate)

    def get_model_name(self):
        """Return a display name made of the provider and the model name."""
        return f"Anthropic {self.model}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.