
Commit

Merge pull request #23 from confident-ai/main
Merge from main.
Anindyadeep authored Dec 27, 2023
2 parents 5c9448c + 87aa422 commit b7174e8
Showing 27 changed files with 391 additions and 166 deletions.
8 changes: 6 additions & 2 deletions README.md
@@ -26,16 +26,20 @@ Whether your application is implemented via RAG or fine-tuning, LangChain or Lla

# Features

- Large variety of ready-to-use evaluation metrics powered by LLMs, statistical methods, or NLP models that runs **locally on your machine**:
- Large variety of ready-to-use evaluation metrics powered by LLMs (all with explanations), statistical methods, or NLP models that runs **locally on your machine**:
- Hallucination
- Summarization
- Answer Relevancy
- Faithfulness
- Contextual Recall
- Contextual Precision
- RAGAS
- G-Eval
- Toxicity
- Bias
- etc.
- Easily create your own custom metrics that are automatically integrated with DeepEval's ecosystem by inheriting DeepEval's base metric class.
- Evaluate your entire dataset in bulk using fewer than 20 lines of Python code **in parallel**.
- Evaluate your entire dataset in bulk in under 20 lines of Python code **in parallel**.
- [Automatically integrated with Confident AI](https://app.confident-ai.com) for continuous evaluation throughout the lifetime of your LLM (app):
- log evaluation results and analyze metrics pass / fails
- compare and pick the optimal hyperparameters (eg. prompt templates, chunk size, models used, etc.) based on evaluation results
2 changes: 1 addition & 1 deletion deepeval/_version.py
@@ -1 +1 @@
__version__: str = "0.20.42"
__version__: str = "0.20.43"
60 changes: 44 additions & 16 deletions deepeval/chat_completion/retry.py
@@ -1,19 +1,47 @@
from typing import Callable, Any
import random
import time
import openai


def call_openai_with_retry(
callable: Callable[[], Any], max_retries: int = 2
) -> Any:
for _ in range(max_retries):
try:
response = callable()
return response
except Exception as e:
print(f"An error occurred: {e}. Retrying...")
time.sleep(2)
continue

raise Exception(
"Max retries reached. Unable to make a successful API call to OpenAI."
)
def retry_with_exponential_backoff(
func,
initial_delay: float = 1,
exponential_base: float = 2,
jitter: bool = True,
max_retries: int = 10,
errors: tuple = (openai.RateLimitError,),
):
"""Retry a function with exponential backoff."""

def wrapper(*args, **kwargs):
# Initialize variables
num_retries = 0
delay = initial_delay

# Loop until a successful response or max_retries is hit or an exception is raised
while True:
try:
return func(*args, **kwargs)

# Retry on specified errors
except errors as e:
# Increment retries
num_retries += 1

# Check if max retries has been reached
if num_retries > max_retries:
raise Exception(
f"Maximum number of retries ({max_retries}) exceeded."
)

# Increment the delay
delay *= exponential_base * (1 + jitter * random.random())

# Sleep for the delay
time.sleep(delay)

# Raise exceptions for any errors not specified
except Exception as e:
raise e

return wrapper
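
For reference, a minimal sketch of how the new retry_with_exponential_backoff wrapper might be applied to an OpenAI call; the import path, client setup, and example arguments are assumptions for illustration rather than part of this diff.

```python
# Illustrative only: the import path and client usage are assumptions.
import openai

from deepeval.chat_completion.retry import retry_with_exponential_backoff

client = openai.OpenAI()


@retry_with_exponential_backoff
def create_chat_completion(**kwargs):
    # Retried on openai.RateLimitError with exponentially growing,
    # jittered delays; gives up after max_retries (default 10) attempts.
    return client.chat.completions.create(**kwargs)


response = create_chat_completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello!"}],
)
```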
66 changes: 0 additions & 66 deletions deepeval/cli/azure_openai.py

This file was deleted.

57 changes: 55 additions & 2 deletions deepeval/cli/main.py
@@ -8,14 +8,12 @@
pass
from deepeval.key_handler import KEY_FILE_HANDLER, KeyValues
from deepeval.cli.test import app as test_app
from deepeval.cli.azure_openai import app as azure_openai_app
from typing import Optional
import webbrowser

app = typer.Typer(name="deepeval")

app.add_typer(test_app, name="test")
app.add_typer(azure_openai_app, name="azure-openai")


@app.command()
@@ -58,5 +56,60 @@ def login(
)


@app.command(name="set-azure-openai")
def set_azure_openai_env(
azure_openai_api_key: str = typer.Option(
..., "--openai-api-key", help="Azure OpenAI API key"
),
azure_openai_endpoint: str = typer.Option(
..., "--openai-endpoint", help="Azure OpenAI endpoint"
),
openai_api_version: str = typer.Option(
..., "--openai-api-version", help="OpenAI API version"
),
azure_deployment_name: str = typer.Option(
..., "--deployment-name", help="Azure deployment name"
),
azure_model_version: Optional[str] = typer.Option(
None, "--model-version", help="Azure model version (optional)"
),
):
KEY_FILE_HANDLER.write_key(
KeyValues.AZURE_OPENAI_API_KEY, azure_openai_api_key
)
KEY_FILE_HANDLER.write_key(
KeyValues.AZURE_OPENAI_ENDPOINT, azure_openai_endpoint
)
KEY_FILE_HANDLER.write_key(KeyValues.OPENAI_API_VERSION, openai_api_version)
KEY_FILE_HANDLER.write_key(
KeyValues.AZURE_DEPLOYMENT_NAME, azure_deployment_name
)

if azure_model_version is not None:
KEY_FILE_HANDLER.write_key(
KeyValues.AZURE_MODEL_VERSION, azure_model_version
)

KEY_FILE_HANDLER.write_key(KeyValues.USE_AZURE_OPENAI, "YES")

print(
":raising_hands: Congratulations! You're now using Azure OpenAI for all evals that require an LLM."
)


@app.command(name="unset-azure-openai")
def unset_azure_openai_env():
KEY_FILE_HANDLER.remove_key(KeyValues.AZURE_OPENAI_API_KEY)
KEY_FILE_HANDLER.remove_key(KeyValues.AZURE_OPENAI_ENDPOINT)
KEY_FILE_HANDLER.remove_key(KeyValues.OPENAI_API_VERSION)
KEY_FILE_HANDLER.remove_key(KeyValues.AZURE_DEPLOYMENT_NAME)
KEY_FILE_HANDLER.remove_key(KeyValues.AZURE_MODEL_VERSION)
KEY_FILE_HANDLER.remove_key(KeyValues.USE_AZURE_OPENAI)

print(
":raising_hands: Congratulations! You're now using regular OpenAI for all evals that require an LLM."
)


if __name__ == "__main__":
app()
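
For context, the new commands can be run once from a terminal to switch every LLM-based eval over to Azure OpenAI, for example `deepeval set-azure-openai --openai-api-key <key> --openai-endpoint <endpoint> --openai-api-version <version> --deployment-name <name>` (the placeholder values are illustrative; `--model-version` is optional), while `deepeval unset-azure-openai` removes the stored keys and switches evals back to regular OpenAI.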
7 changes: 6 additions & 1 deletion deepeval/metrics/answer_relevancy.py
@@ -19,9 +19,11 @@ def __init__(
self,
minimum_score: float = 0.5,
model: Optional[str] = None,
include_reason: bool = True,
):
self.minimum_score = minimum_score
self.model = model
self.include_reason = include_reason
self.n = 5

def measure(self, test_case: LLMTestCase) -> float:
@@ -34,7 +36,7 @@ def measure(self, test_case: LLMTestCase) -> float:
"Input, actual output, or retrieval context cannot be None"
)
print(
"✨ 🍰 ✨ You're using DeepEval's newest Answer Relevancy Metric! This may take a minute."
"✨ 🍰 ✨ You're using DeepEval's latest Answer Relevancy Metric! This may take a minute..."
)
self.key_points: List[str] = self._generate_key_points(
test_case.actual_output, "\n".join(test_case.retrieval_context)
@@ -63,6 +65,9 @@ def _generate_score(self):
def _generate_reason(
self, original_question: str, answer: str, score: float
) -> str:
if self.include_reason is False:
return None

irrelevant_points = []
for verdict in self.verdicts:
if verdict.verdict.strip().lower() == "no":
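
For reference, a minimal sketch of how the new include_reason flag might be used with the updated metric (the same parameter is added to the contextual precision, recall, and relevancy metrics below); the class name, import paths, and example values are assumptions for illustration, not part of this diff.

```python
# Illustrative only: class name, import paths, and values are assumptions.
from deepeval.metrics.answer_relevancy import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# include_reason=False makes _generate_reason() return None early,
# so no reason is produced; the relevancy score is still computed.
metric = AnswerRelevancyMetric(minimum_score=0.5, include_reason=False)

test_case = LLMTestCase(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
    retrieval_context=["Paris is the capital and largest city of France."],
)

score = metric.measure(test_case)
print(score)
```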
7 changes: 6 additions & 1 deletion deepeval/metrics/contextual_precision.py
@@ -20,8 +20,10 @@ def __init__(
self,
minimum_score: float = 0.5,
model: Optional[str] = None,
include_reason: bool = True,
):
self.minimum_score = minimum_score
self.include_reason = include_reason
self.model = model

def measure(self, test_case: LLMTestCase) -> float:
@@ -35,7 +37,7 @@ def measure(self, test_case: LLMTestCase) -> float:
"Input, actual output, expected output, or retrieval context cannot be None"
)
print(
"✨ 🍰 ✨ You're using DeepEval's newest Contextual Precision Metric! This may take a minute."
"✨ 🍰 ✨ You're using DeepEval's latest Contextual Precision Metric! This may take a minute..."
)
self.verdicts: List[
ContextualPrecisionVerdict
@@ -55,6 +57,9 @@ def measure(self, test_case: LLMTestCase) -> float:
return self.score

def _generate_reason(self, input: str, score: float):
if self.include_reason is False:
return None

retrieval_contexts_verdicts = [
{
"verdict": verdict.verdict,
7 changes: 6 additions & 1 deletion deepeval/metrics/contextual_recall.py
@@ -19,9 +19,11 @@ def __init__(
self,
minimum_score: float = 0.5,
model: Optional[str] = None,
include_reason: bool = True,
):
self.minimum_score = minimum_score
self.model = model
self.include_reason = include_reason
self.n = 5

def measure(self, test_case: LLMTestCase) -> float:
@@ -35,7 +37,7 @@ def measure(self, test_case: LLMTestCase) -> float:
"Input, actual output, expected output, or retrieval context cannot be None"
)
print(
"✨ 🍰 ✨ You're using DeepEval's newest Contextual Recall Metric! This may take a minute."
"✨ 🍰 ✨ You're using DeepEval's latest Contextual Recall Metric! This may take a minute..."
)
self.verdicts: List[ContextualRecallVerdict] = self._generate_verdicts(
test_case.expected_output, test_case.retrieval_context
@@ -52,6 +54,9 @@ def measure(self, test_case: LLMTestCase) -> float:
return self.score

def _generate_reason(self, expected_output: str, score: float):
if self.include_reason is False:
return None

supportive_reasons = []
unsupportive_reasons = []
for verdict in self.verdicts:
7 changes: 6 additions & 1 deletion deepeval/metrics/contextual_relevancy.py
@@ -20,9 +20,11 @@ def __init__(
self,
minimum_score: float = 0.5,
model: Optional[str] = "gpt-4",
include_reason: bool = True,
):
self.minimum_score = minimum_score
self.model = model
self.include_reason = include_reason

def measure(self, test_case: LLMTestCase) -> float:
if (
@@ -34,7 +36,7 @@ def measure(self, test_case: LLMTestCase) -> float:
"Input, actual output, or retrieval context cannot be None"
)
print(
"✨ 🍰 ✨ You're using DeepEval's newest Contextual Relevancy Metric! This may take a minute."
"✨ 🍰 ✨ You're using DeepEval's latest Contextual Relevancy Metric! This may take a minute..."
)
self.verdicts_list: List[
List[ContextualRelevancyVerdict]
@@ -53,6 +55,9 @@ def measure(self, test_case: LLMTestCase) -> float:
return self.score

def _generate_reason(self, input: str, score: float):
if self.include_reason is False:
return None

irrelevant_sentences = []
for index, verdicts in enumerate(self.verdicts_list):
for verdict in verdicts: