Merge pull request #12 from confident-ai/main
merge from main.
Anindyadeep authored Nov 13, 2023
2 parents 5973760 + 0596510 commit d006691
Showing 57 changed files with 1,257 additions and 1,170 deletions.
Binary file modified .DS_Store
11 changes: 7 additions & 4 deletions README.md
@@ -25,6 +25,7 @@ Whether your application is implemented via RAG or fine-tuning, LangChain or Lla
<br />

# Features

- Large variety of ready-to-use evaluation metrics, ranging from LLM evaluated (G-Eval) to metrics computed via statistical methods or NLP models.
- Easily create your own custom metrics that are automatically integrated with DeepEval's ecosystem by inheriting DeepEval's base metric class.
- Evaluate your entire dataset in bulk using fewer than 20 lines of Python code.
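The custom-metric bullet above translates directly into code. The import paths and the `score`/`minimum_score`/`is_successful()` members below are the ones this diff itself references in `deepeval/api.py`; the class body, the `measure` method, and the `LengthMetric` name are illustrative assumptions rather than code from this commit.

```python
from deepeval.metrics.base_metric import BaseMetric
from deepeval.test_case import LLMTestCase


class LengthMetric(BaseMetric):
    """Hypothetical custom metric: passes when the chatbot's answer stays short."""

    def __init__(self, minimum_score: float = 0.5):
        self.minimum_score = minimum_score  # pass/fail threshold

    def measure(self, test_case: LLMTestCase) -> float:
        # Toy scoring rule purely for illustration; real metrics would call an
        # LLM, a statistical method, or an NLP model as the feature list says.
        self.score = 1.0 if len(test_case.actual_output) < 200 else 0.0
        return self.score

    def is_successful(self) -> bool:
        return self.score >= self.minimum_score
```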
@@ -68,7 +69,7 @@ Open `test_chatbot.py` and write your first test case using DeepEval:
import pytest
from deepeval.metrics.factual_consistency import FactualConsistencyMetric
from deepeval.test_case import LLMTestCase
from deepeval.run_test import assert_test
from deepeval.evaluator import assert_test

def test_case():
input = "What if these shoes don't fit?"
@@ -92,7 +93,7 @@ deepeval test run test_chatbot.py
- The variable `input` mimics user input, and `actual_output` is a placeholder for your chatbot's intended output based on this query.
- The variable `context` contains the relevant information from your knowledge base, and `FactualConsistencyMetric(minimum_score=0.7)` is an out-of-the-box metric provided by DeepEval. It helps you evaluate the factual accuracy of your chatbot's output based on the provided context.
- The metric score ranges from 0 - 1. The `minimum_score=0.7` threshold ultimately determines whether your test has passed or not.

[Read our documentation](https://docs.confident-ai.com) for more information on how to use additional metrics, create your own custom metrics, and tutorials on how to integrate with other tools like LangChain and LlamaIndex.
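Pieced together from the imports in the hunk above and the variables the bullets describe, the complete `test_chatbot.py` presumably looks roughly like the following; the `context` and `actual_output` strings are placeholders, not taken from the diff.

```python
import pytest
from deepeval.metrics.factual_consistency import FactualConsistencyMetric
from deepeval.test_case import LLMTestCase
from deepeval.evaluator import assert_test


def test_case():
    input = "What if these shoes don't fit?"
    # Placeholder knowledge-base context and chatbot output for illustration.
    context = ["All customers are eligible for a 30 day full refund at no extra cost."]
    actual_output = "We offer a 30-day full refund at no extra cost."
    factual_consistency_metric = FactualConsistencyMetric(minimum_score=0.7)
    test_case = LLMTestCase(
        input=input, actual_output=actual_output, context=context
    )
    assert_test(test_case, [factual_consistency_metric])
```

Running `deepeval test run test_chatbot.py` then executes this test through pytest, as the hunk above shows.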

<br />
@@ -130,17 +131,19 @@ Please read [CONTRIBUTING.md](https://github.com/confident-ai/deepeval/blob/main
# Roadmap

Features:
- [x] Implement G-Eval

- [x] Implement G-Eval
- [ ] Referenceless Evaluation
- [ ] Production Evaluation & Logging
- [ ] Evaluation Dataset Creation

Integrations:

- [x] LlamaIndex
- [ ] LangChain
- [ ] Guidance
- [ ] Guardrails
- [ ] EmbedChain
- [ ] EmbedChain

<br />

2 changes: 1 addition & 1 deletion deepeval/_version.py
@@ -1 +1 @@
__version__: str = "0.20.13"
__version__: str = "0.20.17"
213 changes: 1 addition & 212 deletions deepeval/api.py
@@ -2,27 +2,12 @@
import platform
import urllib.parse
import requests
import json
import warnings
from collections import defaultdict

from typing import Any, Optional
from pydantic import BaseModel, Field
from typing import List
from requests.adapters import HTTPAdapter, Response, Retry

from deepeval.constants import (
API_KEY_ENV,
PYTEST_RUN_ENV_VAR,
PYTEST_RUN_TEST_NAME,
)
from deepeval.constants import API_KEY_ENV
from deepeval.key_handler import KEY_FILE_HANDLER
from deepeval.metrics.base_metric import BaseMetric
from deepeval.test_case import LLMTestCase
from deepeval.tracing import TraceData, get_trace_stack

API_BASE_URL = "https://app.confident-ai.com/api"
# API_BASE_URL = "http://localhost:3000/api"

# Parameters for HTTP retry
HTTP_TOTAL_RETRIES = 3 # Number of total retries
@@ -31,184 +16,6 @@
HTTP_RETRY_ALLOWED_METHODS = frozenset({"GET", "POST", "DELETE"})


class MetricsMetadata(BaseModel):
metric: str
score: float
minimum_score: float = Field(None, alias="minimumScore")


class APITestCase(BaseModel):
name: str
input: str
actual_output: str = Field(..., alias="actualOutput")
expected_output: Optional[str] = Field(None, alias="expectedOutput")
success: bool
metrics_metadata: List[MetricsMetadata] = Field(
..., alias="metricsMetadata"
)
run_duration: float = Field(..., alias="runDuration")
context: Optional[list] = Field(None)
traceStack: Optional[dict] = Field(None)


class MetricScore(BaseModel):
metric: str
score: float

@classmethod
def from_metric(cls, metric: BaseMetric):
return cls(metric=metric.__name__, score=metric.score)


class TestRunResponse(BaseModel):
"""Add Test Run Results"""

testRunId: str
projectId: str


class MetricDict:
def __init__(self):
self.metric_dict = {}
self.metric_count = {}

def add_metric(self, metric_name, score):
if metric_name not in self.metric_dict:
self.metric_dict[metric_name] = score
self.metric_count[metric_name] = 1
else:
self.metric_dict[metric_name] += score
self.metric_count[metric_name] += 1

def get_average_metric_score(self):
return [
MetricScore(
metric=metric,
score=self.metric_dict[metric] / self.metric_count[metric],
)
for metric in self.metric_dict
]


class MetricsMetadataAverageDict:
def __init__(self):
self.metric_dict = defaultdict(list)
self.min_score_dict = defaultdict(float)

def add_metric(self, metric: BaseMetric):
self.metric_dict[metric.__name__].append(metric.score)
self.min_score_dict[metric.__name__] = min(
self.min_score_dict.get(metric.__name__, float("inf")),
metric.minimum_score,
)

def get_metrics_metadata(self):
return [
MetricsMetadata(
metric=metric_name,
score=sum(scores) / len(scores),
minimumScore=self.min_score_dict[metric_name],
)
for metric_name, scores in self.metric_dict.items()
]


class TestRun(BaseModel):
test_file: Optional[str] = Field(
# TODO: Fix test_file
"test.py",
alias="testFile",
)
test_cases: List[APITestCase] = Field(
alias="testCases", default_factory=lambda: []
)
metric_scores: List[MetricScore] = Field(
default_factory=lambda: [], alias="metricScores"
)
configurations: dict

def add_llm_test_case(
self,
test_case: LLMTestCase,
metrics: List[BaseMetric],
run_duration: float,
):
# Check if test case with the same ID already exists
# TODO: bug for pytest batch runs - unable to find test case name
existing_test_case: APITestCase = next(
(tc for tc in self.test_cases if tc.name == test_case.__name__),
None,
)

metrics_metadata_dict = MetricsMetadataAverageDict()
for metric in metrics:
metrics_metadata_dict.add_metric(metric)
metrics_metadata = metrics_metadata_dict.get_metrics_metadata()
success = all([metric.is_successful() for metric in metrics])

if existing_test_case:
# If it exists, append the metrics to the existing test case
existing_test_case.metrics_metadata.extend(metrics_metadata)
# Update the success status
existing_test_case.success = success and existing_test_case.success
else:
# If it doesn't exist, create a new test case
# Adding backwards compatibility to ensure context still works.
context = test_case.context
if isinstance(context, str):
context = [context]
self.test_cases.append(
APITestCase(
# Get the test from the pytest plugin
name=os.getenv(PYTEST_RUN_TEST_NAME, "-"),
input=test_case.input,
actualOutput=test_case.actual_output,
expectedOutput=test_case.expected_output,
success=success,
metricsMetadata=metrics_metadata,
runDuration=run_duration,
context=context,
traceStack=get_trace_stack(),
)
)

all_metric_dict = MetricDict()

for test_case in self.test_cases:
test_case: APITestCase
metrics = test_case.metrics_metadata
for metric in metrics:
metric: MetricsMetadata
all_metric_dict.add_metric(metric.metric, metric.score)

self.metric_scores = all_metric_dict.get_average_metric_score()

def save(self, file_path: Optional[str] = None):
if file_path is None:
file_path = os.getenv(PYTEST_RUN_ENV_VAR)
# If file Path is None, remove it
if not file_path:
return
elif not file_path.endswith(".json"):
file_path = f"{file_path}.json"
with open(file_path, "w") as f:
json.dump(self.dict(by_alias=True, exclude_none=True), f)

return file_path

@classmethod
def load(cls, file_path: Optional[str] = None):
if file_path is None:
file_path = os.getenv(PYTEST_RUN_ENV_VAR)
# If file Path is None, remove it
if not file_path:
return
elif not file_path.endswith(".json"):
file_path = f"{file_path}.json"
with open(file_path, "r") as f:
return cls(**json.load(f))


class Api:
"""Internal Api reference for handling http operations"""

@@ -458,21 +265,3 @@ def quote_string(text: str) -> str:
str: Quoted text in return
"""
return urllib.parse.quote(text, safe="")

def post_test_run(self, test_run: TestRun) -> TestRunResponse:
"""Post a test run"""
try:
# make sure to exclude none for `context` to ensure it is handled properly
body = test_run.model_dump(by_alias=True, exclude_none=True)
except AttributeError:
# Pydantic version below 2.0
body = test_run.dict(by_alias=True, exclude_none=True)

result = self.post_request(
endpoint="/v1/test-run",
body=body,
)
response = TestRunResponse(
testRunId=result["testRunId"], projectId=result["projectId"]
)
return response
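The surviving top of `api.py` keeps the `HTTPAdapter`/`Retry` imports and the HTTP retry constants while the test-run models move out. A minimal sketch of how constants like these are typically mounted onto a `requests.Session` (an assumption about the elided middle of the file, not code from this commit):

```python
import requests
from requests.adapters import HTTPAdapter, Retry

HTTP_TOTAL_RETRIES = 3
HTTP_RETRY_BACKOFF_FACTOR = 2  # name and value assumed; the hunk truncates this block
HTTP_RETRY_ALLOWED_METHODS = frozenset({"GET", "POST", "DELETE"})

session = requests.Session()
retry_strategy = Retry(
    total=HTTP_TOTAL_RETRIES,
    backoff_factor=HTTP_RETRY_BACKOFF_FACTOR,
    allowed_methods=HTTP_RETRY_ALLOWED_METHODS,
)
session.mount("https://", HTTPAdapter(max_retries=retry_strategy))
session.mount("http://", HTTPAdapter(max_retries=retry_strategy))
```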
2 changes: 2 additions & 0 deletions deepeval/chat_completion/retry.py
@@ -1,6 +1,8 @@
from typing import Callable, Any
import openai
import time
import os
import sys


def call_openai_with_retry(
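Only the new `os` and `sys` imports and the truncated signature of `call_openai_with_retry` are visible in this hunk. As a generic illustration of the retry-with-backoff pattern such a helper usually implements — not the repository's actual body — a sketch might look like this:

```python
import time
from typing import Any, Callable, Optional


def call_with_retry(func: Callable[[], Any], retries: int = 3, delay: float = 2.0) -> Any:
    """Illustrative helper: retry a callable a few times, sleeping between attempts."""
    last_error: Optional[Exception] = None
    for attempt in range(retries):
        try:
            return func()
        except Exception as error:  # the real helper presumably narrows this to OpenAI errors
            last_error = error
            time.sleep(delay * (attempt + 1))  # simple linear backoff
    raise last_error
```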
2 changes: 1 addition & 1 deletion deepeval/cli/examples.py
@@ -1,6 +1,6 @@
CUSTOMER_EXAMPLE = """import pytest
from deepeval.test_case import LLMTestCase
from deepeval.run_test import assert_test
from deepeval.evaluator import assert_test
from deepeval.metrics.factual_consistency import FactualConsistencyMetric
from deepeval.metrics.answer_relevancy import AnswerRelevancyMetric
2 changes: 2 additions & 0 deletions deepeval/cli/main.py
@@ -9,6 +9,7 @@
from deepeval.api import Api
from deepeval.key_handler import KEY_FILE_HANDLER
from deepeval.cli.test import app as test_app
import webbrowser

app = typer.Typer(name="deepeval")

@@ -29,6 +30,7 @@ def login(
print(
"Grab your API key here: [link=https://app.confident-ai.com]https://app.confident-ai.com[/link] "
)
webbrowser.open("https://app.confident-ai.com")
if api_key == "":
while True:
api_key = input("Paste your API Key: ").strip()
33 changes: 17 additions & 16 deletions deepeval/cli/test.py
@@ -1,16 +1,15 @@
import pytest
import typer
import os
import datetime
from typing_extensions import Annotated
from ..metrics.overall_score import assert_overall_score
from deepeval.metrics.overall_score import assert_overall_score
from .cli_key_handler import set_env_vars
from ..constants import PYTEST_RUN_ENV_VAR
from .examples import CUSTOMER_EXAMPLE
from typing import Optional
from deepeval.test_run import test_run_manager, TEMP_FILE_NAME
from deepeval.utils import delete_file_if_exists

try:
from rich import print
from rich.progress import Progress, SpinnerColumn, TextColumn
except Exception as e:
pass

@@ -79,7 +78,7 @@ def sample():
pass


def check_if_legit_file(test_file_or_directory: str):
def check_if_valid_file(test_file_or_directory: str):
if "::" in test_file_or_directory:
test_file_or_directory, test_case = test_file_or_directory.split("::")
if os.path.isfile(test_file_or_directory):
@@ -109,17 +108,20 @@ def run(
show_warnings: Annotated[
bool, typer.Option("--show-warnings", "-w/-W")
] = False,
num_processes: Optional[int] = typer.Option(
None,
"--num-processes",
"-n",
help="Number of processes to use with pytest",
),
):
"""Run a test"""
check_if_legit_file(test_file_or_directory)
delete_file_if_exists(TEMP_FILE_NAME)
check_if_valid_file(test_file_or_directory)
pytest_args = [test_file_or_directory]
if exit_on_first_failure:
pytest_args.insert(0, "-x")

# Generate environment variable based on current date and time
env_var = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
os.environ[PYTEST_RUN_ENV_VAR] = env_var

pytest_args.extend(
[
"--verbose" if verbose else "--quiet",
@@ -132,16 +134,15 @@
pytest_args.append("--pdb")
if not show_warnings:
pytest_args.append("--disable-warnings")
if num_processes is not None:
pytest_args.extend(["-n", str(num_processes)])

# Add the deepeval plugin file to pytest arguments
pytest_args.extend(["-p", "plugins"])

retcode = pytest.main(pytest_args)

# Print this if the run env var is not set
if not os.getenv(PYTEST_RUN_ENV_VAR):
print(
"✅ Tests finished! If logged in, view results on https://app.confident-ai.com/"
)
test_run_manager.wrap_up_test_run()
return retcode


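The new `--num-processes`/`-n` option is forwarded straight to pytest. Assuming the `pytest-xdist` plugin is installed (the diff itself does not show that dependency), a command such as `deepeval test run tests/ -n 4` ends up building an argument list roughly like the one below before handing it to `pytest.main`:

```python
import pytest

# Rough reconstruction of the argument list `run()` assembles for
# `deepeval test run tests/ -n 4`; the path and worker count are illustrative.
pytest_args = ["tests/", "--quiet", "--disable-warnings"]
pytest_args.extend(["-n", "4"])        # consumed by pytest-xdist to spawn 4 workers
pytest_args.extend(["-p", "plugins"])  # load the deepeval pytest plugin, as in the diff
retcode = pytest.main(pytest_args)     # same call the CLI makes
```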