
Commit

Merge pull request #23 from confident-ai/main
Merge from main.
Anindyadeep authored Dec 27, 2023
2 parents 5c9448c + 87aa422 commit b7174e8
Showing 27 changed files with 391 additions and 166 deletions.
8 changes: 6 additions & 2 deletions README.md
@@ -26,16 +26,20 @@ Whether your application is implemented via RAG or fine-tuning, LangChain or Lla

# Features

- Large variety of ready-to-use evaluation metrics powered by LLMs, statistical methods, or NLP models that runs **locally on your machine**:
- Large variety of ready-to-use evaluation metrics powered by LLMs (all with explanations), statistical methods, or NLP models that runs **locally on your machine**:
- Hallucination
- Summarization
- Answer Relevancy
- Faithfulness
- Contextual Recall
- Contextual Precision
- RAGAS
- G-Eval
- Toxicity
- Bias
- etc.
- Easily create your own custom metrics that are automatically integrated with DeepEval's ecosystem by inheriting DeepEval's base metric class.
- Evaluate your entire dataset in bulk using fewer than 20 lines of Python code **in parallel**.
- Evaluate your entire dataset in bulk in under 20 lines of Python code **in parallel**.
- [Automatically integrated with Confident AI](https://app.confident-ai.com) for continuous evaluation throughout the lifetime of your LLM (app):
- log evaluation results and analyze metrics pass / fails
- compare and pick the optimal hyperparameters (eg. prompt templates, chunk size, models used, etc.) based on evaluation results
2 changes: 1 addition & 1 deletion deepeval/_version.py
@@ -1 +1 @@
__version__: str = "0.20.42"
__version__: str = "0.20.43"
60 changes: 44 additions & 16 deletions deepeval/chat_completion/retry.py
@@ -1,19 +1,47 @@
from typing import Callable, Any
import random
import time
import openai


def call_openai_with_retry(
callable: Callable[[], Any], max_retries: int = 2
) -> Any:
for _ in range(max_retries):
try:
response = callable()
return response
except Exception as e:
print(f"An error occurred: {e}. Retrying...")
time.sleep(2)
continue

raise Exception(
"Max retries reached. Unable to make a successful API call to OpenAI."
)
def retry_with_exponential_backoff(
func,
initial_delay: float = 1,
exponential_base: float = 2,
jitter: bool = True,
max_retries: int = 10,
errors: tuple = (openai.RateLimitError,),
):
"""Retry a function with exponential backoff."""

def wrapper(*args, **kwargs):
# Initialize variables
num_retries = 0
delay = initial_delay

# Loop until a successful response or max_retries is hit or an exception is raised
while True:
try:
return func(*args, **kwargs)

# Retry on specified errors
except errors as e:
# Increment retries
num_retries += 1

# Check if max retries has been reached
if num_retries > max_retries:
raise Exception(
f"Maximum number of retries ({max_retries}) exceeded."
)

# Increment the delay
delay *= exponential_base * (1 + jitter * random.random())

# Sleep for the delay
time.sleep(delay)

# Raise exceptions for any errors not specified
except Exception as e:
raise e

return wrapper
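
For reference, a minimal sketch of how the new retry_with_exponential_backoff wrapper might be applied to an OpenAI call; the import path, client setup, and example arguments are assumptions for illustration rather than part of this diff.

```python
# Illustrative only: the import path and client usage are assumptions.
import openai

from deepeval.chat_completion.retry import retry_with_exponential_backoff

client = openai.OpenAI()


@retry_with_exponential_backoff
def create_chat_completion(**kwargs):
    # Retried on openai.RateLimitError with exponentially growing,
    # jittered delays; gives up after max_retries (default 10) attempts.
    return client.chat.completions.create(**kwargs)


response = create_chat_completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello!"}],
)
```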
66 changes: 0 additions & 66 deletions deepeval/cli/azure_openai.py

This file was deleted.

57 changes: 55 additions & 2 deletions deepeval/cli/main.py
@@ -8,14 +8,12 @@
pass
from deepeval.key_handler import KEY_FILE_HANDLER, KeyValues
from deepeval.cli.test import app as test_app
from deepeval.cli.azure_openai import app as azure_openai_app
from typing import Optional
import webbrowser

app = typer.Typer(name="deepeval")

app.add_typer(test_app, name="test")
app.add_typer(azure_openai_app, name="azure-openai")


@app.command()
@@ -58,5 +56,60 @@ def login(
)


@app.command(name="set-azure-openai")
def set_azure_openai_env(
azure_openai_api_key: str = typer.Option(
..., "--openai-api-key", help="Azure OpenAI API key"
),
azure_openai_endpoint: str = typer.Option(
..., "--openai-endpoint", help="Azure OpenAI endpoint"
),
openai_api_version: str = typer.Option(
..., "--openai-api-version", help="OpenAI API version"
),
azure_deployment_name: str = typer.Option(
..., "--deployment-name", help="Azure deployment name"
),
azure_model_version: Optional[str] = typer.Option(
None, "--model-version", help="Azure model version (optional)"
),
):
KEY_FILE_HANDLER.write_key(
KeyValues.AZURE_OPENAI_API_KEY, azure_openai_api_key
)
KEY_FILE_HANDLER.write_key(
KeyValues.AZURE_OPENAI_ENDPOINT, azure_openai_endpoint
)
KEY_FILE_HANDLER.write_key(KeyValues.OPENAI_API_VERSION, openai_api_version)
KEY_FILE_HANDLER.write_key(
KeyValues.AZURE_DEPLOYMENT_NAME, azure_deployment_name
)

if azure_model_version is not None:
KEY_FILE_HANDLER.write_key(
KeyValues.AZURE_MODEL_VERSION, azure_model_version
)

KEY_FILE_HANDLER.write_key(KeyValues.USE_AZURE_OPENAI, "YES")

print(
":raising_hands: Congratulations! You're now using Azure OpenAI for all evals that require an LLM."
)


@app.command(name="unset-azure-openai")
def unset_azure_openai_env():
KEY_FILE_HANDLER.remove_key(KeyValues.AZURE_OPENAI_API_KEY)
KEY_FILE_HANDLER.remove_key(KeyValues.AZURE_OPENAI_ENDPOINT)
KEY_FILE_HANDLER.remove_key(KeyValues.OPENAI_API_VERSION)
KEY_FILE_HANDLER.remove_key(KeyValues.AZURE_DEPLOYMENT_NAME)
KEY_FILE_HANDLER.remove_key(KeyValues.AZURE_MODEL_VERSION)
KEY_FILE_HANDLER.remove_key(KeyValues.USE_AZURE_OPENAI)

print(
":raising_hands: Congratulations! You're now using regular OpenAI for all evals that require an LLM."
)


if __name__ == "__main__":
app()
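
For context, the new commands can be run once from a terminal to switch every LLM-based eval over to Azure OpenAI, for example `deepeval set-azure-openai --openai-api-key <key> --openai-endpoint <endpoint> --openai-api-version <version> --deployment-name <name>` (the placeholder values are illustrative; `--model-version` is optional), while `deepeval unset-azure-openai` removes the stored keys and switches evals back to regular OpenAI.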
7 changes: 6 additions & 1 deletion deepeval/metrics/answer_relevancy.py
@@ -19,9 +19,11 @@ def __init__(
self,
minimum_score: float = 0.5,
model: Optional[str] = None,
include_reason: bool = True,
):
self.minimum_score = minimum_score
self.model = model
self.include_reason = include_reason
self.n = 5

def measure(self, test_case: LLMTestCase) -> float:
@@ -34,7 +36,7 @@ def measure(self, test_case: LLMTestCase) -> float:
"Input, actual output, or retrieval context cannot be None"
)
print(
"✨ 🍰 ✨ You're using DeepEval's newest Answer Relevancy Metric! This may take a minute."
"✨ 🍰 ✨ You're using DeepEval's latest Answer Relevancy Metric! This may take a minute..."
)
self.key_points: List[str] = self._generate_key_points(
test_case.actual_output, "\n".join(test_case.retrieval_context)
@@ -63,6 +65,9 @@ def _generate_score(self):
def _generate_reason(
self, original_question: str, answer: str, score: float
) -> str:
if self.include_reason is False:
return None

irrelevant_points = []
for verdict in self.verdicts:
if verdict.verdict.strip().lower() == "no":
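
For reference, a minimal sketch of how the new include_reason flag might be used with the updated metric (the same parameter is added to the contextual precision, recall, and relevancy metrics below); the class name, import paths, and example values are assumptions for illustration, not part of this diff.

```python
# Illustrative only: class name, import paths, and values are assumptions.
from deepeval.metrics.answer_relevancy import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# include_reason=False makes _generate_reason() return None early,
# so no reason is produced; the relevancy score is still computed.
metric = AnswerRelevancyMetric(minimum_score=0.5, include_reason=False)

test_case = LLMTestCase(
    input="What is the capital of France?",
    actual_output="Paris is the capital of France.",
    retrieval_context=["Paris is the capital and largest city of France."],
)

score = metric.measure(test_case)
print(score)
```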
7 changes: 6 additions & 1 deletion deepeval/metrics/contextual_precision.py
@@ -20,8 +20,10 @@ def __init__(
self,
minimum_score: float = 0.5,
model: Optional[str] = None,
include_reason: bool = True,
):
self.minimum_score = minimum_score
self.include_reason = include_reason
self.model = model

def measure(self, test_case: LLMTestCase) -> float:
@@ -35,7 +37,7 @@ def measure(self, test_case: LLMTestCase) -> float:
"Input, actual output, expected output, or retrieval context cannot be None"
)
print(
"✨ 🍰 ✨ You're using DeepEval's newest Contextual Precision Metric! This may take a minute."
"✨ 🍰 ✨ You're using DeepEval's latest Contextual Precision Metric! This may take a minute..."
)
self.verdicts: List[
ContextualPrecisionVerdict
@@ -55,6 +57,9 @@ def measure(self, test_case: LLMTestCase) -> float:
return self.score

def _generate_reason(self, input: str, score: float):
if self.include_reason is False:
return None

retrieval_contexts_verdicts = [
{
"verdict": verdict.verdict,
7 changes: 6 additions & 1 deletion deepeval/metrics/contextual_recall.py
@@ -19,9 +19,11 @@ def __init__(
self,
minimum_score: float = 0.5,
model: Optional[str] = None,
include_reason: bool = True,
):
self.minimum_score = minimum_score
self.model = model
self.include_reason = include_reason
self.n = 5

def measure(self, test_case: LLMTestCase) -> float:
@@ -35,7 +37,7 @@ def measure(self, test_case: LLMTestCase) -> float:
"Input, actual output, expected output, or retrieval context cannot be None"
)
print(
"✨ 🍰 ✨ You're using DeepEval's newest Contextual Recall Metric! This may take a minute."
"✨ 🍰 ✨ You're using DeepEval's latest Contextual Recall Metric! This may take a minute..."
)
self.verdicts: List[ContextualRecallVerdict] = self._generate_verdicts(
test_case.expected_output, test_case.retrieval_context
@@ -52,6 +54,9 @@ def measure(self, test_case: LLMTestCase) -> float:
return self.score

def _generate_reason(self, expected_output: str, score: float):
if self.include_reason is False:
return None

supportive_reasons = []
unsupportive_reasons = []
for verdict in self.verdicts:
7 changes: 6 additions & 1 deletion deepeval/metrics/contextual_relevancy.py
@@ -20,9 +20,11 @@ def __init__(
self,
minimum_score: float = 0.5,
model: Optional[str] = "gpt-4",
include_reason: bool = True,
):
self.minimum_score = minimum_score
self.model = model
self.include_reason = include_reason

def measure(self, test_case: LLMTestCase) -> float:
if (
@@ -34,7 +36,7 @@ def measure(self, test_case: LLMTestCase) -> float:
"Input, actual output, or retrieval context cannot be None"
)
print(
"✨ 🍰 ✨ You're using DeepEval's newest Contextual Relevancy Metric! This may take a minute."
"✨ 🍰 ✨ You're using DeepEval's latest Contextual Relevancy Metric! This may take a minute..."
)
self.verdicts_list: List[
List[ContextualRelevancyVerdict]
@@ -53,6 +55,9 @@ def measure(self, test_case: LLMTestCase) -> float:
return self.score

def _generate_reason(self, input: str, score: float):
if self.include_reason is False:
return None

irrelevant_sentences = []
for index, verdicts in enumerate(self.verdicts_list):
for verdict in verdicts: