Merge pull request #12 from confident-ai/main
merge from main.
Anindyadeep authored Nov 13, 2023
2 parents 5973760 + 0596510 commit d006691
Showing 57 changed files with 1,257 additions and 1,170 deletions.
Binary file modified .DS_Store
11 changes: 7 additions & 4 deletions README.md
@@ -25,6 +25,7 @@ Whether your application is implemented via RAG or fine-tuning, LangChain or Lla
<br />

# Features

- Large variety of ready-to-use evaluation metrics, ranging from LLM evaluated (G-Eval) to metrics computed via statistical methods or NLP models.
- Easily create your own custom metrics that are automatically integrated with DeepEval's ecosystem by inheriting DeepEval's base metric class.
- Evaluate your entire dataset in bulk using fewer than 20 lines of Python code.
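The custom-metric bullet above translates directly into code. The import paths and the `score`/`minimum_score`/`is_successful()` members below are the ones this diff itself references in `deepeval/api.py`; the class body, the `measure` method, and the `LengthMetric` name are illustrative assumptions rather than code from this commit.

```python
from deepeval.metrics.base_metric import BaseMetric
from deepeval.test_case import LLMTestCase


class LengthMetric(BaseMetric):
    """Hypothetical custom metric: passes when the chatbot's answer stays short."""

    def __init__(self, minimum_score: float = 0.5):
        self.minimum_score = minimum_score  # pass/fail threshold

    def measure(self, test_case: LLMTestCase) -> float:
        # Toy scoring rule purely for illustration; real metrics would call an
        # LLM, a statistical method, or an NLP model as the feature list says.
        self.score = 1.0 if len(test_case.actual_output) < 200 else 0.0
        return self.score

    def is_successful(self) -> bool:
        return self.score >= self.minimum_score
```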
@@ -68,7 +69,7 @@ Open `test_chatbot.py` and write your first test case using DeepEval:
import pytest
from deepeval.metrics.factual_consistency import FactualConsistencyMetric
from deepeval.test_case import LLMTestCase
from deepeval.run_test import assert_test
from deepeval.evaluator import assert_test

def test_case():
input = "What if these shoes don't fit?"
@@ -92,7 +93,7 @@ deepeval test run test_chatbot.py
- The variable `input` mimics user input, and `actual_output` is a placeholder for your chatbot's intended output based on this query.
- The variable `context` contains the relevant information from your knowledge base, and `FactualConsistencyMetric(minimum_score=0.7)` is an out-of-the-box metric provided by DeepEval. It helps you evaluate the factual accuracy of your chatbot's output based on the provided context.
- The metric score ranges from 0 - 1. The `minimum_score=0.7` threshold ultimately determines whether your test has passed or not.

[Read our documentation](https://docs.confident-ai.com) for more information on how to use additional metrics, create your own custom metrics, and tutorials on how to integrate with other tools like LangChain and LlamaIndex.
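Pieced together from the imports in the hunk above and the variables the bullets describe, the complete `test_chatbot.py` presumably looks roughly like the following; the `context` and `actual_output` strings are placeholders, not taken from the diff.

```python
import pytest
from deepeval.metrics.factual_consistency import FactualConsistencyMetric
from deepeval.test_case import LLMTestCase
from deepeval.evaluator import assert_test


def test_case():
    input = "What if these shoes don't fit?"
    # Placeholder knowledge-base context and chatbot output for illustration.
    context = ["All customers are eligible for a 30 day full refund at no extra cost."]
    actual_output = "We offer a 30-day full refund at no extra cost."
    factual_consistency_metric = FactualConsistencyMetric(minimum_score=0.7)
    test_case = LLMTestCase(
        input=input, actual_output=actual_output, context=context
    )
    assert_test(test_case, [factual_consistency_metric])
```

Running `deepeval test run test_chatbot.py` then executes this test through pytest, as the hunk above shows.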

<br />
@@ -130,17 +131,19 @@ Please read [CONTRIBUTING.md](https://github.com/confident-ai/deepeval/blob/main
# Roadmap

Features:
- [x] Implement G-Eval

- [x] Implement G-Eval
- [ ] Referenceless Evaluation
- [ ] Production Evaluation & Logging
- [ ] Evaluation Dataset Creation

Integrations:

- [x] LlamaIndex
- [ ] LangChain
- [ ] Guidance
- [ ] Guardrails
- [ ] EmbedChain
- [ ] EmbedChain

<br />

2 changes: 1 addition & 1 deletion deepeval/_version.py
@@ -1 +1 @@
__version__: str = "0.20.13"
__version__: str = "0.20.17"
213 changes: 1 addition & 212 deletions deepeval/api.py
@@ -2,27 +2,12 @@
import platform
import urllib.parse
import requests
import json
import warnings
from collections import defaultdict

from typing import Any, Optional
from pydantic import BaseModel, Field
from typing import List
from requests.adapters import HTTPAdapter, Response, Retry

from deepeval.constants import (
API_KEY_ENV,
PYTEST_RUN_ENV_VAR,
PYTEST_RUN_TEST_NAME,
)
from deepeval.constants import API_KEY_ENV
from deepeval.key_handler import KEY_FILE_HANDLER
from deepeval.metrics.base_metric import BaseMetric
from deepeval.test_case import LLMTestCase
from deepeval.tracing import TraceData, get_trace_stack

API_BASE_URL = "https://app.confident-ai.com/api"
# API_BASE_URL = "http://localhost:3000/api"

# Parameters for HTTP retry
HTTP_TOTAL_RETRIES = 3 # Number of total retries
@@ -31,184 +16,6 @@
HTTP_RETRY_ALLOWED_METHODS = frozenset({"GET", "POST", "DELETE"})


class MetricsMetadata(BaseModel):
metric: str
score: float
minimum_score: float = Field(None, alias="minimumScore")


class APITestCase(BaseModel):
name: str
input: str
actual_output: str = Field(..., alias="actualOutput")
expected_output: Optional[str] = Field(None, alias="expectedOutput")
success: bool
metrics_metadata: List[MetricsMetadata] = Field(
..., alias="metricsMetadata"
)
run_duration: float = Field(..., alias="runDuration")
context: Optional[list] = Field(None)
traceStack: Optional[dict] = Field(None)


class MetricScore(BaseModel):
metric: str
score: float

@classmethod
def from_metric(cls, metric: BaseMetric):
return cls(metric=metric.__name__, score=metric.score)


class TestRunResponse(BaseModel):
"""Add Test Run Results"""

testRunId: str
projectId: str


class MetricDict:
def __init__(self):
self.metric_dict = {}
self.metric_count = {}

def add_metric(self, metric_name, score):
if metric_name not in self.metric_dict:
self.metric_dict[metric_name] = score
self.metric_count[metric_name] = 1
else:
self.metric_dict[metric_name] += score
self.metric_count[metric_name] += 1

def get_average_metric_score(self):
return [
MetricScore(
metric=metric,
score=self.metric_dict[metric] / self.metric_count[metric],
)
for metric in self.metric_dict
]


class MetricsMetadataAverageDict:
def __init__(self):
self.metric_dict = defaultdict(list)
self.min_score_dict = defaultdict(float)

def add_metric(self, metric: BaseMetric):
self.metric_dict[metric.__name__].append(metric.score)
self.min_score_dict[metric.__name__] = min(
self.min_score_dict.get(metric.__name__, float("inf")),
metric.minimum_score,
)

def get_metrics_metadata(self):
return [
MetricsMetadata(
metric=metric_name,
score=sum(scores) / len(scores),
minimumScore=self.min_score_dict[metric_name],
)
for metric_name, scores in self.metric_dict.items()
]


class TestRun(BaseModel):
test_file: Optional[str] = Field(
# TODO: Fix test_file
"test.py",
alias="testFile",
)
test_cases: List[APITestCase] = Field(
alias="testCases", default_factory=lambda: []
)
metric_scores: List[MetricScore] = Field(
default_factory=lambda: [], alias="metricScores"
)
configurations: dict

def add_llm_test_case(
self,
test_case: LLMTestCase,
metrics: List[BaseMetric],
run_duration: float,
):
# Check if test case with the same ID already exists
# TODO: bug for pytest batch runs - unable to find test case name
existing_test_case: APITestCase = next(
(tc for tc in self.test_cases if tc.name == test_case.__name__),
None,
)

metrics_metadata_dict = MetricsMetadataAverageDict()
for metric in metrics:
metrics_metadata_dict.add_metric(metric)
metrics_metadata = metrics_metadata_dict.get_metrics_metadata()
success = all([metric.is_successful() for metric in metrics])

if existing_test_case:
# If it exists, append the metrics to the existing test case
existing_test_case.metrics_metadata.extend(metrics_metadata)
# Update the success status
existing_test_case.success = success and existing_test_case.success
else:
# If it doesn't exist, create a new test case
# Adding backwards compatibility to ensure context still works.
context = test_case.context
if isinstance(context, str):
context = [context]
self.test_cases.append(
APITestCase(
# Get the test from the pytest plugin
name=os.getenv(PYTEST_RUN_TEST_NAME, "-"),
input=test_case.input,
actualOutput=test_case.actual_output,
expectedOutput=test_case.expected_output,
success=success,
metricsMetadata=metrics_metadata,
runDuration=run_duration,
context=context,
traceStack=get_trace_stack(),
)
)

all_metric_dict = MetricDict()

for test_case in self.test_cases:
test_case: APITestCase
metrics = test_case.metrics_metadata
for metric in metrics:
metric: MetricsMetadata
all_metric_dict.add_metric(metric.metric, metric.score)

self.metric_scores = all_metric_dict.get_average_metric_score()

def save(self, file_path: Optional[str] = None):
if file_path is None:
file_path = os.getenv(PYTEST_RUN_ENV_VAR)
# If file Path is None, remove it
if not file_path:
return
elif not file_path.endswith(".json"):
file_path = f"{file_path}.json"
with open(file_path, "w") as f:
json.dump(self.dict(by_alias=True, exclude_none=True), f)

return file_path

@classmethod
def load(cls, file_path: Optional[str] = None):
if file_path is None:
file_path = os.getenv(PYTEST_RUN_ENV_VAR)
# If file Path is None, remove it
if not file_path:
return
elif not file_path.endswith(".json"):
file_path = f"{file_path}.json"
with open(file_path, "r") as f:
return cls(**json.load(f))


class Api:
"""Internal Api reference for handling http operations"""

@@ -458,21 +265,3 @@ def quote_string(text: str) -> str:
str: Quoted text in return
"""
return urllib.parse.quote(text, safe="")

def post_test_run(self, test_run: TestRun) -> TestRunResponse:
"""Post a test run"""
try:
# make sure to exclude none for `context` to ensure it is handled properly
body = test_run.model_dump(by_alias=True, exclude_none=True)
except AttributeError:
# Pydantic version below 2.0
body = test_run.dict(by_alias=True, exclude_none=True)

result = self.post_request(
endpoint="/v1/test-run",
body=body,
)
response = TestRunResponse(
testRunId=result["testRunId"], projectId=result["projectId"]
)
return response
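The surviving top of `api.py` keeps the `HTTPAdapter`/`Retry` imports and the HTTP retry constants while the test-run models move out. A minimal sketch of how constants like these are typically mounted onto a `requests.Session` (an assumption about the elided middle of the file, not code from this commit):

```python
import requests
from requests.adapters import HTTPAdapter, Retry

HTTP_TOTAL_RETRIES = 3
HTTP_RETRY_BACKOFF_FACTOR = 2  # name and value assumed; the hunk truncates this block
HTTP_RETRY_ALLOWED_METHODS = frozenset({"GET", "POST", "DELETE"})

session = requests.Session()
retry_strategy = Retry(
    total=HTTP_TOTAL_RETRIES,
    backoff_factor=HTTP_RETRY_BACKOFF_FACTOR,
    allowed_methods=HTTP_RETRY_ALLOWED_METHODS,
)
session.mount("https://", HTTPAdapter(max_retries=retry_strategy))
session.mount("http://", HTTPAdapter(max_retries=retry_strategy))
```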
2 changes: 2 additions & 0 deletions deepeval/chat_completion/retry.py
@@ -1,6 +1,8 @@
from typing import Callable, Any
import openai
import time
import os
import sys


def call_openai_with_retry(
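Only the new `os` and `sys` imports and the truncated signature of `call_openai_with_retry` are visible in this hunk. As a generic illustration of the retry-with-backoff pattern such a helper usually implements — not the repository's actual body — a sketch might look like this:

```python
import time
from typing import Any, Callable, Optional


def call_with_retry(func: Callable[[], Any], retries: int = 3, delay: float = 2.0) -> Any:
    """Illustrative helper: retry a callable a few times, sleeping between attempts."""
    last_error: Optional[Exception] = None
    for attempt in range(retries):
        try:
            return func()
        except Exception as error:  # the real helper presumably narrows this to OpenAI errors
            last_error = error
            time.sleep(delay * (attempt + 1))  # simple linear backoff
    raise last_error
```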
2 changes: 1 addition & 1 deletion deepeval/cli/examples.py
@@ -1,6 +1,6 @@
CUSTOMER_EXAMPLE = """import pytest
from deepeval.test_case import LLMTestCase
from deepeval.run_test import assert_test
from deepeval.evaluator import assert_test
from deepeval.metrics.factual_consistency import FactualConsistencyMetric
from deepeval.metrics.answer_relevancy import AnswerRelevancyMetric
2 changes: 2 additions & 0 deletions deepeval/cli/main.py
@@ -9,6 +9,7 @@
from deepeval.api import Api
from deepeval.key_handler import KEY_FILE_HANDLER
from deepeval.cli.test import app as test_app
import webbrowser

app = typer.Typer(name="deepeval")

@@ -29,6 +30,7 @@ def login(
print(
"Grab your API key here: [link=https://app.confident-ai.com]https://app.confident-ai.com[/link] "
)
webbrowser.open("https://app.confident-ai.com")
if api_key == "":
while True:
api_key = input("Paste your API Key: ").strip()
33 changes: 17 additions & 16 deletions deepeval/cli/test.py
@@ -1,16 +1,15 @@
import pytest
import typer
import os
import datetime
from typing_extensions import Annotated
from ..metrics.overall_score import assert_overall_score
from deepeval.metrics.overall_score import assert_overall_score
from .cli_key_handler import set_env_vars
from ..constants import PYTEST_RUN_ENV_VAR
from .examples import CUSTOMER_EXAMPLE
from typing import Optional
from deepeval.test_run import test_run_manager, TEMP_FILE_NAME
from deepeval.utils import delete_file_if_exists

try:
from rich import print
from rich.progress import Progress, SpinnerColumn, TextColumn
except Exception as e:
pass

@@ -79,7 +78,7 @@ def sample():
pass


def check_if_legit_file(test_file_or_directory: str):
def check_if_valid_file(test_file_or_directory: str):
if "::" in test_file_or_directory:
test_file_or_directory, test_case = test_file_or_directory.split("::")
if os.path.isfile(test_file_or_directory):
@@ -109,17 +108,20 @@ def run(
show_warnings: Annotated[
bool, typer.Option("--show-warnings", "-w/-W")
] = False,
num_processes: Optional[int] = typer.Option(
None,
"--num-processes",
"-n",
help="Number of processes to use with pytest",
),
):
"""Run a test"""
check_if_legit_file(test_file_or_directory)
delete_file_if_exists(TEMP_FILE_NAME)
check_if_valid_file(test_file_or_directory)
pytest_args = [test_file_or_directory]
if exit_on_first_failure:
pytest_args.insert(0, "-x")

# Generate environment variable based on current date and time
env_var = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
os.environ[PYTEST_RUN_ENV_VAR] = env_var

pytest_args.extend(
[
"--verbose" if verbose else "--quiet",
@@ -132,16 +134,15 @@
pytest_args.append("--pdb")
if not show_warnings:
pytest_args.append("--disable-warnings")
if num_processes is not None:
pytest_args.extend(["-n", str(num_processes)])

# Add the deepeval plugin file to pytest arguments
pytest_args.extend(["-p", "plugins"])

retcode = pytest.main(pytest_args)

# Print this if the run env var is not set
if not os.getenv(PYTEST_RUN_ENV_VAR):
print(
"✅ Tests finished! If logged in, view results on https://app.confident-ai.com/"
)
test_run_manager.wrap_up_test_run()
return retcode


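The new `--num-processes`/`-n` option is forwarded straight to pytest. Assuming the `pytest-xdist` plugin is installed (the diff itself does not show that dependency), a command such as `deepeval test run tests/ -n 4` ends up building an argument list roughly like the one below before handing it to `pytest.main`:

```python
import pytest

# Rough reconstruction of the argument list `run()` assembles for
# `deepeval test run tests/ -n 4`; the path and worker count are illustrative.
pytest_args = ["tests/", "--quiet", "--disable-warnings"]
pytest_args.extend(["-n", "4"])        # consumed by pytest-xdist to spawn 4 workers
pytest_args.extend(["-p", "plugins"])  # load the deepeval pytest plugin, as in the diff
retcode = pytest.main(pytest_args)     # same call the CLI makes
```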