diff --git a/.github/workflows/deepeval-results.yml b/.github/workflows/deepeval-results.yml
index 0e0a2ae86..865cff5b7 100644
--- a/.github/workflows/deepeval-results.yml
+++ b/.github/workflows/deepeval-results.yml
@@ -37,9 +37,6 @@ jobs:
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
run: poetry install --no-interaction
- - name: Run tests without pytest
- run: poetry run python tests/test_without_pytest.py
-
- name: Run deepeval tests and capture output
run: poetry run deepeval test run tests/test_quickstart.py > output.txt 2>&1
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 14dd22dbb..381a8c2e0 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -65,4 +65,4 @@ jobs:
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
run: |
- poetry run pytest tests/ --ignore=tests/test_llm_metric.py --ignore=tests/test_overall_score.py
+ poetry run pytest tests/ --ignore=tests/test_llm_metric.py
diff --git a/README.md b/README.md
index 2f71d3678..d07f3f891 100644
--- a/README.md
+++ b/README.md
@@ -73,7 +73,7 @@ from deepeval.evaluator import assert_test
def test_case():
input = "What if these shoes don't fit?"
- context = "All customers are eligible for a 30 day full refund at no extra costs."
+ context = ["All customers are eligible for a 30 day full refund at no extra costs."]
# Replace this with the actual output from your LLM application
actual_output = "We offer a 30-day full refund at no extra costs."
@@ -118,7 +118,7 @@ deepeval test run test_chatbot.py
You should see a link displayed in the CLI once the test has finished running. Paste it into your browser to view the results!
-![ok](https://d2lsxfc3p6r9rv.cloudfront.net/dashboard.png)
+![ok](https://d2lsxfc3p6r9rv.cloudfront.net/test-summary.png)
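With the change above, `context` in the quickstart is a list of strings rather than a single string. A minimal sketch of the updated quickstart test, consistent with the README hunk (the `FactualConsistencyMetric` import and the 0.7 threshold are illustrative assumptions, not part of this diff):

from deepeval.metrics import FactualConsistencyMetric
from deepeval.test_case import LLMTestCase
from deepeval.evaluator import assert_test

def test_case():
    input = "What if these shoes don't fit?"
    # context is now a list of retrieval passages, not a bare string
    context = [
        "All customers are eligible for a 30 day full refund at no extra costs."
    ]
    # Replace this with the actual output from your LLM application
    actual_output = "We offer a 30-day full refund at no extra costs."
    factual_consistency_metric = FactualConsistencyMetric(minimum_score=0.7)
    test_case = LLMTestCase(
        input=input, actual_output=actual_output, context=context
    )
    assert_test(test_case, [factual_consistency_metric])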
diff --git a/deepeval/_version.py b/deepeval/_version.py
index 171aba412..9d0d5a8eb 100644
--- a/deepeval/_version.py
+++ b/deepeval/_version.py
@@ -1 +1 @@
-__version__: str = "0.20.17"
+__version__: str = "0.20.19"
diff --git a/deepeval/cli/test.py b/deepeval/cli/test.py
index 2e6d3344d..239327c7f 100644
--- a/deepeval/cli/test.py
+++ b/deepeval/cli/test.py
@@ -2,8 +2,6 @@
import typer
import os
from typing_extensions import Annotated
-from deepeval.metrics.overall_score import assert_overall_score
-from .cli_key_handler import set_env_vars
from typing import Optional
from deepeval.test_run import test_run_manager, TEMP_FILE_NAME
from deepeval.utils import delete_file_if_exists
@@ -17,67 +15,6 @@
app = typer.Typer(name="test")
-def sample():
- set_env_vars()
- print("Sending sample test results...")
- print(
- "If this is your first time running these models, it may take a while."
- )
- try:
- query = "How does photosynthesis work?"
- output = "Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize foods with the help of chlorophyll pigment."
- expected_output = "Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize food with the help of chlorophyll pigment."
- context = "Biology"
-
- assert_overall_score(query, output, expected_output, context)
-
- except AssertionError as e:
- pass
- try:
- query = "What is the capital of France?"
- output = "The capital of France is Paris."
- expected_output = "The capital of France is Paris."
- context = "Geography"
-
- assert_overall_score(query, output, expected_output, context)
-
- except AssertionError as e:
- pass
- try:
- query = "What are the major components of a cell?"
- output = "Cells have many major components, including the cell membrane, nucleus, mitochondria, and endoplasmic reticulum."
- expected_output = "Cells have several major components, such as the cell membrane, nucleus, mitochondria, and endoplasmic reticulum."
- context = "Biology"
- minimum_score = 0.8 # Adjusting the minimum score threshold
-
- assert_overall_score(
- query, output, expected_output, context, minimum_score
- )
-
- except AssertionError as e:
- pass
-
- try:
- query = "What is the capital of Japan?"
- output = "The largest city in Japan is Tokyo."
- expected_output = "The capital of Japan is Tokyo."
- context = "Geography"
-
- assert_overall_score(query, output, expected_output, context)
- except AssertionError as e:
- pass
-
- try:
- query = "Explain the theory of relativity."
- output = "Einstein's theory of relativity is famous."
- expected_output = "Einstein's theory of relativity revolutionized our understanding of space, time, and gravity."
- context = "Physics"
-
- assert_overall_score(query, output, expected_output, context)
- except AssertionError as e:
- pass
-
-
def check_if_valid_file(test_file_or_directory: str):
if "::" in test_file_or_directory:
test_file_or_directory, test_case = test_file_or_directory.split("::")
diff --git a/deepeval/dataset.py b/deepeval/dataset.py
index 4433fe27b..d8a9fb1a4 100644
--- a/deepeval/dataset.py
+++ b/deepeval/dataset.py
@@ -1,593 +1,220 @@
-"""Class for Evaluation Datasets
-"""
+from typing import List, Optional
+from dataclasses import dataclass
+import pandas as pd
import json
-import random
-import time
-from collections import UserList
-from datetime import datetime
-from typing import Any, Callable, List, Optional
-from tabulate import tabulate
-
-from deepeval.evaluator import run_test
-from deepeval.metrics.base_metric import BaseMetric
+from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase
-from dataclasses import asdict
+from deepeval.evaluator import evaluate
-class EvaluationDataset(UserList):
- """Class for Evaluation Datasets - which are a list of test cases"""
+@dataclass
+class EvaluationDataset:
+ test_cases: List[LLMTestCase]
- def __init__(self, test_cases: List[LLMTestCase]):
- self.data: List[LLMTestCase] = test_cases
+    def __init__(self, test_cases: Optional[List[LLMTestCase]] = None):
+        self.test_cases = test_cases if test_cases is not None else []
- @classmethod
- def from_csv(
- cls, # Use 'cls' instead of 'self' for class methods
- csv_filename: str,
- query_column: Optional[str] = None,
- expected_output_column: Optional[str] = None,
- context_column: Optional[str] = None,
- output_column: Optional[str] = None,
- id_column: str = None,
- metrics: List[BaseMetric] = None,
- ):
- import pandas as pd
-
- df = pd.read_csv(csv_filename)
- if query_column is not None and query_column in df.columns:
- querys = df[query_column].values
- else:
- querys = [None] * len(df)
- if (
- expected_output_column is not None
- and expected_output_column in df.columns
- ):
- expected_outputs = df[expected_output_column].values
- else:
- expected_outputs = [None] * len(df)
- if context_column is not None and context_column in df.columns:
- contexts = df[context_column].values
- else:
- contexts = [None] * len(df)
- if output_column is not None and output_column in df.columns:
- outputs = df[output_column].values
- else:
- outputs = [None] * len(df)
- if id_column is not None:
- ids = df[id_column].values
- else:
- ids = [None] * len(df)
-
- # Initialize the 'data' attribute as an empty list
- cls.data = []
-
- for i, query_data in enumerate(querys):
- cls.data.append(
- LLMTestCase(
- input=query_data,
- expected_output=expected_outputs[i],
- context=contexts[i],
- id=ids[i] if id_column else None,
- actual_output=outputs[i] if output_column else None,
- )
- )
- return cls(cls.data)
+ def add_test_case(self, test_case: LLMTestCase):
+ self.test_cases.append(test_case)
- def from_test_cases(self, test_cases: list):
- self.data = test_cases
+ def __iter__(self):
+ return iter(self.test_cases)
- @classmethod
- def from_hf_dataset(
- cls,
- dataset_name: str,
- split: str,
- query_column: str,
- expected_output_column: str,
- context_column: str = None,
- output_column: str = None,
- id_column: str = None,
+ def evaluate(self, metrics: List[BaseMetric]):
+ return evaluate(self.test_cases, metrics)
+
+ def load_csv(
+ self,
+ file_path: str,
+ input_col_name: str,
+ actual_output_col_name: str,
+ expected_output_col_name: Optional[str] = None,
+ context_col_name: Optional[str] = None,
+ context_col_delimiter: str = ";",
):
"""
- Load test cases from a HuggingFace dataset.
+ Load test cases from a CSV file.
+
+ This method reads a CSV file, extracting test case data based on specified column names. It creates LLMTestCase objects for each row in the CSV and adds them to the Dataset instance. The context data, if provided, is expected to be a delimited string in the CSV, which this method will parse into a list.
Args:
- dataset_name (str): The name of the HuggingFace dataset to load.
- split (str): The split of the dataset to load (e.g., 'train', 'test').
- query_column (str): The column in the dataset corresponding to the query.
- expected_output_column (str): The column in the dataset corresponding to the expected output.
- context_column (str, optional): The column in the dataset corresponding to the context. Defaults to None.
- output_column (str, optional): The column in the dataset corresponding to the output. Defaults to None.
- id_column (str, optional): The column in the dataset corresponding to the ID. Defaults to None.
+ file_path (str): Path to the CSV file containing the test cases.
+ input_col_name (str): The column name in the CSV corresponding to the input for the test case.
+ actual_output_col_name (str): The column name in the CSV corresponding to the actual output for the test case.
+ expected_output_col_name (str, optional): The column name in the CSV corresponding to the expected output for the test case. Defaults to None.
+ context_col_name (str, optional): The column name in the CSV corresponding to the context for the test case. Defaults to None.
+            context_col_delimiter (str, optional): The delimiter used to separate items in the context list within the CSV file. Defaults to ';'.
Returns:
- EvaluationDataset: An instance of EvaluationDataset containing the loaded test cases.
- """
- try:
- from datasets import load_dataset
- except ImportError:
- raise ImportError(
- "The 'datasets' library is missing. Please install it using pip: pip install datasets"
- )
+ None: The method adds test cases to the Dataset instance but does not return anything.
- hf_dataset = load_dataset(dataset_name, split=split)
- test_cases = []
+ Raises:
+ FileNotFoundError: If the CSV file specified by `file_path` cannot be found.
+ pd.errors.EmptyDataError: If the CSV file is empty.
+ KeyError: If one or more specified columns are not found in the CSV file.
- for i, row in enumerate(hf_dataset):
- test_cases.append(
- LLMTestCase(
- input=row[query_column],
- expected_output=row[expected_output_column],
- context=row[context_column] if context_column else None,
- actual_output=row[output_column] if output_column else None,
- id=row[id_column] if id_column else None,
- )
- )
- return cls(test_cases)
-
- @classmethod
- def from_json(
- cls,
- json_filename: str,
- query_column: str,
- expected_output_column: str,
- context_column: str,
- output_column: str,
- id_column: str = None,
- ):
- """
- This is for JSON data in the format of key-value array pairs.
- {
- "query": ["What is the customer success number", "What is the customer success number"],
- "context": ["Context 1", "Context 2"],
- "output": ["Output 1", "Output 2"]
- }
-
- if the JSON data is in a list of dictionaries, use from_json_list
+ Note:
+ The CSV file is expected to contain columns as specified in the arguments. Each row in the file represents a single test case. The method assumes the file is properly formatted and the specified columns exist. For context data represented as lists in the CSV, ensure the correct delimiter is specified.
"""
- with open(json_filename, "r") as f:
- data = json.load(f)
- test_cases = []
+ df = pd.read_csv(file_path)
- for i, query in enumerate(data[query_column]):
- test_cases.append(
- LLMTestCase(
- input=data[query_column][i],
- expected_output=data[expected_output_column][i],
- context=data[context_column][i],
- actual_output=data[output_column][i],
- id=data[id_column][i] if id_column else None,
- )
+ inputs = self._get_column_data(df, input_col_name)
+ actual_outputs = self._get_column_data(df, actual_output_col_name)
+ expected_outputs = self._get_column_data(
+ df, expected_output_col_name, default=None
+ )
+ contexts = [
+ context.split(context_col_delimiter) if context else []
+ for context in self._get_column_data(
+ df, context_col_name, default=""
)
- return cls(test_cases)
-
- @classmethod
- def from_json_list(
- cls,
- json_filename: str,
- query_column: str,
- expected_output_column: str,
- context_column: str,
- output_column: str,
- id_column: str = None,
- ):
- """
- This is for JSON data in the format of a list of dictionaries.
- [
- {"query": "What is the customer success number", "expected_output": "What is the customer success number", "context": "Context 1", "output": "Output 1"},
]
- """
- with open(json_filename, "r") as f:
- data = json.load(f)
- test_cases = []
- for i, query in enumerate(data):
- test_cases.append(
+
+ for input, actual_output, expected_output, context in zip(
+ inputs, actual_outputs, expected_outputs, contexts
+ ):
+ self.add_test_case(
LLMTestCase(
- input=data[i][query_column],
- expected_output=data[i][expected_output_column],
- context=data[i][context_column],
- actual_output=data[i][output_column],
- id=data[i][id_column] if id_column else None,
+ input=input,
+ actual_output=actual_output,
+ expected_output=expected_output,
+ context=context,
)
)
- return cls(test_cases)
-
- @classmethod
- def from_dict(
- cls,
- data: List[dict],
- query_key: str,
- expected_output_key: str,
- context_key: str = None,
- output_key: str = None,
- id_key: str = None,
+
+ def _get_column_data(self, df: pd.DataFrame, col_name: str, default=None):
+ return (
+ df[col_name].values
+ if col_name in df.columns
+ else [default] * len(df)
+ )
+
+ def load_json_list(
+ self,
+ file_path: str,
+ input_key_name: str,
+ actual_output_key_name: str,
+ expected_output_key_name: Optional[str] = None,
+ context_key_name: Optional[str] = None,
):
"""
- Load test cases from a list of dictionaries.
+ Load test cases from a JSON file.
+
+ This method reads a JSON file containing a list of objects, each representing a test case. It extracts the necessary information based on specified key names and creates LLMTestCase objects to add to the Dataset instance.
Args:
- data (List[dict]): The list of dictionaries containing the test case data.
- query_key (str): The key in each dictionary corresponding to the query.
- expected_output_key (str): The key in each dictionary corresponding to the expected output.
- context_key (str, optional): The key in each dictionary corresponding to the context. Defaults to None.
- output_key (str, optional): The key in each dictionary corresponding to the output. Defaults to None.
- id_key (str, optional): The key in each dictionary corresponding to the ID. Defaults to None.
- metrics (List[BaseMetric], optional): The list of metrics to be associated with the test cases. Defaults to None.
+ file_path (str): Path to the JSON file containing the test cases.
+ input_key_name (str): The key name in the JSON objects corresponding to the input for the test case.
+ actual_output_key_name (str): The key name in the JSON objects corresponding to the actual output for the test case.
+ expected_output_key_name (str, optional): The key name in the JSON objects corresponding to the expected output for the test case. Defaults to None.
+ context_key_name (str, optional): The key name in the JSON objects corresponding to the context for the test case. Defaults to None.
Returns:
- EvaluationDataset: An instance of EvaluationDataset containing the loaded test cases.
- """
- test_cases = []
- for i, case_data in enumerate(data):
- test_cases.append(
- LLMTestCase(
- input=case_data[query_key],
- expected_output=case_data[expected_output_key],
- context=case_data[context_key] if context_key else None,
- actual_output=case_data[output_key] if output_key else None,
- id=case_data[id_key] if id_key else None,
- )
- )
- return cls(test_cases)
-
- def to_dict(self):
- return [asdict(x) for x in self.data]
+ None: The method adds test cases to the Dataset instance but does not return anything.
- def to_csv(self, csv_filename: str):
- import pandas as pd
+ Raises:
+ FileNotFoundError: If the JSON file specified by `file_path` cannot be found.
+ ValueError: If the JSON file is not valid or if required keys (input and actual output) are missing in one or more JSON objects.
- df = pd.DataFrame(self.data)
- df.to_csv(csv_filename, index=False)
-
- def to_json(self, json_filename: str):
- with open(json_filename, "w") as f:
- json.dump(self.data, f)
+ Note:
+ The JSON file should be structured as a list of objects, with each object containing the required keys. The method assumes the file format and keys are correctly defined and present.
+ """
+ try:
+ with open(file_path, "r") as file:
+ json_list = json.load(file)
+ except FileNotFoundError:
+ raise FileNotFoundError(f"The file {file_path} was not found.")
+ except json.JSONDecodeError:
+ raise ValueError(f"The file {file_path} is not a valid JSON file.")
+
+ # Process each JSON object
+ for json_obj in json_list:
+ if (
+ input_key_name not in json_obj
+ or actual_output_key_name not in json_obj
+ ):
+ raise ValueError(
+ "Required fields are missing in one or more JSON objects"
+ )
- def from_hf_evals(self):
- raise NotImplementedError
+ input = json_obj[input_key_name]
+ actual_output = json_obj[actual_output_key_name]
+ expected_output = json_obj.get(expected_output_key_name)
+ context = json_obj.get(context_key_name)
- def from_df(self):
- raise NotImplementedError
+ self.add_test_case(
+ LLMTestCase(
+ input=input,
+ actual_output=actual_output,
+ expected_output=expected_output,
+ context=context,
+ )
+ )
- def __repr__(self):
- return f"{self.__class__.__name__}({self.data})"
+ def load_hf_dataset(
+ self,
+ dataset_name: str,
+ input_field_name: str,
+ actual_output_field_name: str,
+ expected_output_field_name: Optional[str] = None,
+ context_field_name: Optional[str] = None,
+ split: str = "train",
+ ):
+ """
+ Load test cases from a Hugging Face dataset.
- def sample(self, n: int = 5):
- if len(self.data) <= n:
- n = len(self.data)
- result = random.sample(self.data, n)
- return [asdict(r) for r in result]
+ This method loads a specified dataset and split from Hugging Face's datasets library, then iterates through each entry to create and add LLMTestCase objects to the Dataset instance based on specified field names.
- def head(self, n: int = 5):
- return self.data[:n]
+ Args:
+ dataset_name (str): The name of the Hugging Face dataset to load.
+ split (str): The split of the dataset to load (e.g., 'train', 'test', 'validation'). Defaults to 'train'.
+ input_field_name (str): The field name in the dataset corresponding to the input for the test case.
+ actual_output_field_name (str): The field name in the dataset corresponding to the actual output for the test case.
+ expected_output_field_name (str, optional): The field name in the dataset corresponding to the expected output for the test case. Defaults to None.
+ context_field_name (str, optional): The field name in the dataset corresponding to the context for the test case. Defaults to None.
- def __getitem__(self, index):
- return self.data[index]
+ Returns:
+ None: The method adds test cases to the Dataset instance but does not return anything.
- def __setitem__(self, index, value):
- self.data[index] = value
+ Raises:
+ ValueError: If the required fields (input and actual output) are not found in the dataset.
+ FileNotFoundError: If the specified dataset is not available in Hugging Face's datasets library.
+ datasets.DatasetNotFoundError: Specific Hugging Face error if the dataset or split is not found.
+ json.JSONDecodeError: If there is an issue in reading or processing the dataset.
- def __delitem__(self, index):
- del self.data[index]
+ Note:
+ Ensure that the dataset structure aligns with the expected field names. The method assumes each dataset entry is a dictionary-like object.
+ """
- def run_evaluation(
- self,
- completion_fn: Callable[[str], str] = None,
- outputs: List[str] = None,
- test_filename: str = None,
- max_retries: int = 3,
- min_success: int = 1,
- metrics: List[BaseMetric] = None,
- ) -> str:
- """Run evaluation with given metrics"""
- if completion_fn is None:
- assert outputs is not None
-
- table: List[List[Any]] = []
-
- headers: List[str] = [
- "Test Passed",
- "Metric Name",
- "Score",
- "Output",
- "Expected output",
- "Message",
- ]
- results = run_test(
- test_cases=self.data,
- metrics=metrics,
- raise_error=True,
- max_retries=max_retries,
- min_success=min_success,
- )
- for result in results:
- table.append(
- [
- result.success,
- result.metric_name,
- result.score,
- result.output,
- result.expected_output,
- "",
- ]
- )
- if test_filename is None:
- test_filename = (
- f"test-result-{datetime.now().__str__().replace(' ', '-')}.txt"
- )
- with open(test_filename, "w") as f:
- f.write(tabulate(table, headers=headers))
- print(f"Saved to {test_filename}")
- for t in table:
- assert t[0] == True, t[-1]
- return test_filename
-
- def review(self):
- """A bulk editor for reviewing synthetic data."""
try:
- from dash import (
- Dash,
- Input,
- Output,
- State,
- callback,
- dash_table,
- dcc,
- html,
- )
- except ModuleNotFoundError:
- raise Exception(
- """You will need to run `pip install dash` to be able to review tests that were automatically created."""
+ from datasets import load_dataset
+ except ImportError:
+ raise ImportError(
+ "The 'datasets' library is missing. Please install it using pip: pip install datasets"
)
+ hf_dataset = load_dataset(dataset_name, split=split)
- table_data = [
- {"input": x.query, "expected_output": x.expected_output}
- for x in self.data
- ]
- app = Dash(
- __name__,
- external_stylesheets=[
- "https://cdn.jsdelivr.net/npm/bootswatch@5.3.1/dist/darkly/bootstrap.min.css"
- ],
- )
-
- app.layout = html.Div(
- [
- html.H1("Bulk Review Test Cases", style={"marginLeft": "20px"}),
- html.Button(
- "Add Test case",
- id="editing-rows-button",
- n_clicks=0,
- style={
- "padding": "8px",
- "backgroundColor": "purple", # Added purple background color
- "color": "white",
- "border": "2px solid purple", # Added purple border
- "marginLeft": "20px",
- },
- ),
- html.Div(
- dash_table.DataTable(
- id="adding-rows-table",
- columns=[
- {
- "name": c.title().replace("_", " "),
- "id": c,
- "deletable": True,
- "renamable": True,
- }
- for i, c in enumerate(["input", "expected_output"])
- ],
- data=table_data,
- editable=True,
- row_deletable=True,
- style_data_conditional=[
- {
- "if": {"row_index": "odd"},
- "backgroundColor": "rgb(40, 40, 40)",
- "color": "white",
- },
- {
- "if": {"row_index": "even"},
- "backgroundColor": "rgb(30, 30, 30)",
- "color": "white",
- },
- {
- "if": {"state": "selected"},
- "backgroundColor": "white",
- "color": "white",
- },
- ],
- style_header={
- "backgroundColor": "rgb(30, 30, 30)",
- "color": "white",
- "fontWeight": "bold",
- "padding": "10px", # Added padding
- },
- style_cell={
- "padding": "10px", # Added padding
- "whiteSpace": "pre-wrap", # Wrap cell contents
- "maxHeight": "200px",
- },
- ),
- style={"padding": "20px"}, # Added padding
- ),
- html.Div(style={"margin-top": "20px"}),
- html.Button(
- "Save To CSV",
- id="save-button",
- n_clicks=0,
- style={
- "padding": "8px",
- "backgroundColor": "purple", # Added purple background color
- "color": "white",
- "border": "2px solid purple", # Added purple border
- "marginLeft": "20px",
- },
- ),
- dcc.Input(
- id="filename-input",
- type="text",
- placeholder="Enter filename (.csv format)",
- style={
- "padding": "8px",
- "backgroundColor": "rgb(30, 30, 30)",
- "color": "white",
- "marginLeft": "20px",
- "border": "2px solid purple", # Added purple border
- "width": "200px", # Edited width
- },
- value="review-test.csv",
- ),
- html.Div(id="code-output"),
- ],
- style={"padding": "20px"}, # Added padding
- )
-
- @callback(
- Output("adding-rows-table", "data"),
- Input("editing-rows-button", "n_clicks"),
- State("adding-rows-table", "data"),
- State("adding-rows-table", "columns"),
- )
- def add_row(n_clicks, rows, columns):
- if n_clicks > 0:
- rows.append({c["id"]: "" for c in columns})
- return rows
-
- @callback(
- Output("save-button", "n_clicks"),
- Input("save-button", "n_clicks"),
- State("adding-rows-table", "data"),
- State("adding-rows-table", "columns"),
- State("filename-input", "value"),
- )
- def save_data(n_clicks, rows, columns, filename):
- if n_clicks > 0:
- import csv
-
- with open(filename, "w", newline="") as f:
- writer = csv.DictWriter(
- f, fieldnames=[c["id"] for c in columns]
- )
- writer.writeheader()
- writer.writerows(rows)
- return n_clicks
-
- @app.callback(
- Output("code-output", "children"),
- Input("save-button", "n_clicks"),
- State("filename-input", "value"),
- )
- def show_code(n_clicks, filename):
- if n_clicks > 0:
- code = f"""
- from deepeval.dataset import EvaluationDataset
-
- # Replace 'filename.csv' with the actual filename
- ds = EvaluationDataset.from_csv('{filename}')
-
- # Access the data in the CSV file
- # For example, you can print a few rows
- print(ds.sample())
- """
- return html.Div(
- [
- html.P(
- "Code to load the CSV file back into a dataset for testing:"
- ),
- html.Pre(code, className="language-python"),
- ],
- style={"padding": "20px"}, # Added padding
+ # Process each entry in the dataset
+ for entry in hf_dataset:
+ if (
+ input_field_name not in entry
+ or actual_output_field_name not in entry
+ ):
+ raise ValueError(
+ "Required fields are missing in one or more dataset entries"
)
- else:
- return ""
-
- app.run(debug=False)
- def add_evaluation_query_answer_pairs(
- self,
- openai_api_key: str,
- context: str,
- n: int = 3,
- model: str = "openai/gpt-3.5-turbo",
- ):
- """Utility function to create an evaluation dataset using ChatGPT."""
- new_dataset = create_evaluation_query_answer_pairs(
- openai_api_key=openai_api_key, context=context, n=n, model=model
- )
- self.data += new_dataset.data
- print(f"Added {len(new_dataset.data)}!")
+ input = entry[input_field_name]
+ actual_output = entry[actual_output_field_name]
+ expected_output = entry.get(expected_output_field_name)
+ context = entry.get(context_field_name)
-
-def make_chat_completion_request(prompt: str, openai_api_key: str):
- import openai
-
- openai.api_key = openai_api_key
- response = openai.ChatCompletion.create(
- model="gpt-3.5-turbo",
- messages=[
- {"role": "system", "content": "You are a helpful assistant."},
- {"role": "user", "content": prompt},
- ],
- )
- return response.choices[0].message.content
-
-
-def generate_chatgpt_output(prompt: str, openai_api_key: str) -> str:
- max_retries = 3
- retry_delay = 1
- for attempt in range(max_retries):
- try:
- expected_output = make_chat_completion_request(
- prompt=prompt, openai_api_key=openai_api_key
- )
- break
- except Exception as e:
- print(f"Error occurred: {e}")
- if attempt < max_retries - 1:
- print(f"Retrying in {retry_delay} seconds...")
- time.sleep(retry_delay)
- retry_delay *= 2
- else:
- raise
-
- return expected_output
-
-
-def create_evaluation_query_answer_pairs(
- openai_api_key: str,
- context: str,
- n: int = 3,
- model: str = "openai/gpt-3.5-turbo",
-) -> EvaluationDataset:
- """Utility function to create an evaluation dataset using GPT."""
- prompt = f"""You are generating {n} sets of of query-answer pairs to create an evaluation dataset based on the below context.
-Context: {context}
-
-Respond in JSON format in 1 single line without white spaces an array of JSON with the keys `query` and `answer`. Do not use any other keys in the response.
-JSON:"""
- for _ in range(3):
- try:
- responses = generate_chatgpt_output(
- prompt, openai_api_key=openai_api_key
+ self.add_test_case(
+ LLMTestCase(
+ input=input,
+ actual_output=actual_output,
+ expected_output=expected_output,
+ context=context,
+ )
)
- responses = json.loads(responses)
- break
- except Exception as e:
- print(e)
- return EvaluationDataset(test_cases=[])
-
- test_cases = []
- for response in responses:
- test_case = LLMTestCase(
- input=response["query"],
- expected_output=response["answer"],
- context=context,
- # store this as None for now
- actual_output="-",
- )
- test_cases.append(test_case)
-
- dataset = EvaluationDataset(test_cases=test_cases)
- return dataset
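The rewritten `EvaluationDataset` replaces the old `from_csv`/`from_json`/`from_hf_dataset` constructors with instance-level loaders and an `evaluate()` helper. A minimal sketch of the new flow under assumed inputs (the file name and column names below are hypothetical):

from deepeval.dataset import EvaluationDataset
from deepeval.metrics import FactualConsistencyMetric

dataset = EvaluationDataset()
dataset.load_csv(
    file_path="qa_pairs.csv",  # hypothetical file
    input_col_name="question",
    actual_output_col_name="answer",
    expected_output_col_name="reference_answer",
    context_col_name="context",
    context_col_delimiter=";",  # context cells hold ';'-separated passages
)
# runs every listed metric over every loaded test case
dataset.evaluate([FactualConsistencyMetric(minimum_score=0.5)])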
diff --git a/deepeval/evaluator.py b/deepeval/evaluator.py
index 63322ce9f..74d80a10f 100644
--- a/deepeval/evaluator.py
+++ b/deepeval/evaluator.py
@@ -1,16 +1,15 @@
"""Function for running test
"""
-import os
-import warnings
-from typing import List, Optional, Union
+
+from typing import List
import time
from dataclasses import dataclass
-from .retry import retry
+import copy
-from .metrics import BaseMetric
-from .test_case import LLMTestCase, TestCase
+from deepeval.progress_context import progress_context
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCase
from deepeval.test_run import test_run_manager
-import sys
@dataclass
@@ -18,110 +17,122 @@ class TestResult:
"""Returned from run_test"""
success: bool
- score: float
- metric_name: str
- query: str
- output: str
+ metrics: List[BaseMetric]
+ input: str
+ actual_output: str
expected_output: str
- metadata: Optional[dict]
- context: str
-
- def __post_init__(self):
- """Ensures score is between 0 and 1 after initialization"""
- original_score = self.score
- self.score = min(max(0, self.score), 1)
- if self.score != original_score:
- warnings.warn(
- "The score was adjusted to be within the range [0, 1]."
- )
-
- def __gt__(self, other: "TestResult") -> bool:
- """Greater than comparison based on score"""
- return self.score > other.score
-
- def __lt__(self, other: "TestResult") -> bool:
- """Less than comparison based on score"""
- return self.score < other.score
+ context: List[str]
def create_test_result(
test_case: LLMTestCase,
success: bool,
- metric: float,
+    metrics: List[BaseMetric],
) -> TestResult:
if isinstance(test_case, LLMTestCase):
return TestResult(
success=success,
- score=metric.score,
- metric_name=metric.__name__,
- query=test_case.input if test_case.input else "-",
- output=test_case.actual_output if test_case.actual_output else "-",
- expected_output=test_case.expected_output
- if test_case.expected_output
- else "-",
- metadata=None,
+ metrics=metrics,
+ input=test_case.input,
+ actual_output=test_case.actual_output,
+ expected_output=test_case.expected_output,
context=test_case.context,
)
else:
raise ValueError("TestCase not supported yet.")
-def run_test(
- test_cases: Union[TestCase, LLMTestCase, List[LLMTestCase]],
+def execute_test(
+ test_cases: List[LLMTestCase],
metrics: List[BaseMetric],
- max_retries: int = 1,
- delay: int = 1,
- min_success: int = 1,
- raise_error: bool = False,
+ save_to_disk: bool = False,
) -> List[TestResult]:
- if isinstance(test_cases, TestCase):
- test_cases = [test_cases]
-
- test_results = []
+    test_results: List[TestResult] = []
+ test_run_manager.save_to_disk = save_to_disk
+ count = 0
for test_case in test_cases:
- failed_metrics = []
+ success = True
for metric in metrics:
test_start_time = time.perf_counter()
- # score = metric.measure(test_case)
- metric.score = metric.measure(test_case)
- success = metric.is_successful()
+ metric.measure(test_case)
test_end_time = time.perf_counter()
run_duration = test_end_time - test_start_time
- # metric.score = score
test_run_manager.get_test_run().add_llm_test_case(
test_case=test_case,
metrics=[metric],
run_duration=run_duration,
+ index=count,
)
test_run_manager.save_test_run()
- test_result = create_test_result(test_case, success, metric)
- test_results.append(test_result)
- if not success:
- failed_metrics.append((metric.__name__, metric.score))
+ if not metric.is_successful():
+ success = False
- if raise_error and failed_metrics:
- raise AssertionError(
- f"Metrics {', '.join([f'{name} (Score: {score})' for name, score in failed_metrics])} failed."
+ count += 1
+ test_result = create_test_result(
+ test_case, success, copy.deepcopy(metrics)
)
+ test_results.append(test_result)
return test_results
-def assert_test(
- test_cases: Union[LLMTestCase, List[LLMTestCase]],
+def run_test(
+ test_case: LLMTestCase,
metrics: List[BaseMetric],
- max_retries: int = 1,
- delay: int = 1,
- min_success: int = 1,
) -> List[TestResult]:
- """Assert a test"""
- return run_test(
- test_cases=test_cases,
- metrics=metrics,
- max_retries=max_retries,
- delay=delay,
- min_success=min_success,
- raise_error=True,
- )
+ with progress_context("Executing run_test()..."):
+ test_result = execute_test([test_case], metrics, False)[0]
+ print_test_result(test_result)
+ print("\n" + "-" * 70)
+ return test_result
+
+
+def assert_test(test_case: LLMTestCase, metrics: List[BaseMetric]):
+ # len(execute_test(...)) is always 1 for assert_test
+ test_result = execute_test([test_case], metrics, True)[0]
+ if not test_result.success:
+ failed_metrics = [
+ metric
+ for metric in test_result.metrics
+ if not metric.is_successful()
+ ]
+ failed_metrics_str = ", ".join(
+ [
+ f"{metric.__name__} (score: {metric.score}, minimum_score: {metric.minimum_score})"
+ for metric in failed_metrics
+ ]
+ )
+ raise AssertionError(f"Metrics {failed_metrics_str} failed.")
+
+
+def evaluate(test_cases: List[LLMTestCase], metrics: List[BaseMetric]):
+ with progress_context("Evaluating testcases..."):
+ test_results = execute_test(test_cases, metrics, True)
+ for test_result in test_results:
+ print_test_result(test_result)
+ print("\n" + "-" * 70)
+
+ test_run_manager.wrap_up_test_run(display_table=False)
+ return test_results
+
+
+def print_test_result(test_result: TestResult):
+ print("\n" + "=" * 70 + "\n")
+ print("Metrics Summary\n")
+ for metric in test_result.metrics:
+ if not metric.is_successful():
+ print(
+ f" - ❌ {metric.__name__} (score: {metric.score}, minimum_score: {metric.minimum_score})"
+ )
+ else:
+ print(
+ f" - ✅ {metric.__name__} (score: {metric.score}, minimum_score: {metric.minimum_score})"
+ )
+
+ print("\nFor test case:\n")
+ print(f" - input: {test_result.input}")
+ print(f" - actual output: {test_result.actual_output}")
+ print(f" - expected output: {test_result.expected_output}")
+ print(f" - context: {test_result.context}")
diff --git a/deepeval/metrics/__init__.py b/deepeval/metrics/__init__.py
index 68e51d1ab..bb4a2301c 100644
--- a/deepeval/metrics/__init__.py
+++ b/deepeval/metrics/__init__.py
@@ -1 +1,7 @@
-from .base_metric import *
+from .answer_relevancy import AnswerRelevancyMetric
+from .base_metric import BaseMetric
+from .conceptual_similarity import ConceptualSimilarityMetric
+from .factual_consistency import FactualConsistencyMetric
+from .judgemental_gpt import JudgementalGPT
+from .llm_eval_metric import LLMEvalMetric
+from .ragas_metric import RagasMetric
diff --git a/deepeval/metrics/answer_relevancy.py b/deepeval/metrics/answer_relevancy.py
index a26f3c3c2..9c4219109 100644
--- a/deepeval/metrics/answer_relevancy.py
+++ b/deepeval/metrics/answer_relevancy.py
@@ -1,6 +1,5 @@
from deepeval.singleton import Singleton
from deepeval.test_case import LLMTestCase
-from deepeval.evaluator import assert_test
from deepeval.metrics.base_metric import BaseMetric
import numpy as np
@@ -77,31 +76,3 @@ def is_successful(self) -> bool:
@property
def __name__(self):
return "Answer Relevancy"
-
-
-def assert_answer_relevancy(
- query: str,
- output: str,
- minimum_score: float = 0.5,
- model_type: str = "default",
-):
- metric = AnswerRelevancyMetric(
- minimum_score=minimum_score, model_type=model_type
- )
- test_case = LLMTestCase(input=query, actual_output=output)
- assert_test(test_case, metrics=[metric])
-
-
-def is_answer_relevant(
- query: str,
- output: str,
- minimum_score: float = 0.5,
- model_type: str = "default",
-) -> bool:
- """Check if the output is relevant to the query."""
-
- metric = AnswerRelevancyMetric(
- minimum_score=minimum_score, model_type=model_type
- )
- test_case = LLMTestCase(input=query, actual_output=output)
- return metric.measure(test_case) >= minimum_score
diff --git a/deepeval/metrics/base_metric.py b/deepeval/metrics/base_metric.py
index 138dcb909..8299e2d35 100644
--- a/deepeval/metrics/base_metric.py
+++ b/deepeval/metrics/base_metric.py
@@ -1,11 +1,13 @@
from abc import abstractmethod
from deepeval.test_case import LLMTestCase
+from typing import Optional
class BaseMetric:
# set an arbitrary minimum score that will get over-ridden later
score: float = 0
+ reason: Optional[str] = None
@property
def minimum_score(self) -> float:
@@ -18,7 +20,7 @@ def minimum_score(self, value: float):
# Measure function signature is subject to be different - not sure
# how applicable this is - might need a better abstraction
@abstractmethod
- def measure(self, test_case: LLMTestCase, *args, **kwargs):
+ def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
raise NotImplementedError
def _get_init_values(self):
diff --git a/deepeval/metrics/bias_classifier.py b/deepeval/metrics/bias_classifier.py
index a8571d180..45d510a6a 100644
--- a/deepeval/metrics/bias_classifier.py
+++ b/deepeval/metrics/bias_classifier.py
@@ -6,10 +6,9 @@
import warnings
from typing import Optional, List
-from deepeval.metrics.base_metric import BaseMetric
+from deepeval.metrics.base_metric import BaseMetric
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
-from ..evaluator import assert_test
class UnBiasedMetric(BaseMetric):
@@ -84,27 +83,3 @@ def assert_successful(self):
@property
def __name__(self):
return "Bias Score"
-
-
-def assert_unbiased(
- evaluation_params: List[LLMTestCaseParams],
- input: str,
- actual_output: str,
- minimum_score: float = 0.5,
-): # see paper for rationale https://arxiv.org/pdf/2208.05777.pdf
- metric = UnBiasedMetric(
- evaluation_params=evaluation_params, minimum_score=minimum_score
- )
- test_case = LLMTestCase(input=input, actual_output=actual_output)
- assert_test(test_case, [metric])
-
-
-def is_unbiased(
- text: str,
- minimum_score: float = 0.5,
-) -> bool:
- """Check if the text is unbiased."""
-
- metric = UnBiasedMetric(minimum_score=minimum_score)
- test_case = LLMTestCase(input="placeholder", actual_output=text)
- return metric.measure(test_case) >= minimum_score
diff --git a/deepeval/metrics/conceptual_similarity.py b/deepeval/metrics/conceptual_similarity.py
index aefb6034f..49131c6b7 100644
--- a/deepeval/metrics/conceptual_similarity.py
+++ b/deepeval/metrics/conceptual_similarity.py
@@ -5,7 +5,6 @@
from deepeval.singleton import Singleton
from deepeval.test_case import LLMTestCase
from deepeval.utils import cosine_similarity
-from deepeval.evaluator import assert_test
from deepeval.progress_context import progress_context
from deepeval.metrics.base_metric import BaseMetric
@@ -39,35 +38,12 @@ def measure(self, test_case: LLMTestCase) -> float:
test_case.actual_output, test_case.expected_output
)
self.score = cosine_similarity(vectors[0], vectors[1])
+ self.success = self.score >= self.minimum_score
return float(self.score)
def is_successful(self) -> bool:
- return bool(self.score >= self.minimum_score)
+ return self.success
@property
def __name__(self):
return "Conceptual Similarity"
-
-
-def assert_conceptual_similarity(
- output: str, expected_output: str, minimum_score=0.7
-):
- metric = ConceptualSimilarityMetric(minimum_score=minimum_score)
- test_case = LLMTestCase(
- input="placeholder",
- actual_output=output,
- expected_output=expected_output,
- )
- assert_test(test_case, [metric])
-
-
-def is_conceptually_similar(
- output: str, expected_output: str, minimum_score=0.7
-) -> bool:
- metric = ConceptualSimilarityMetric(minimum_score=minimum_score)
- test_case = LLMTestCase(
- input="placeholder",
- actual_output=output,
- expected_output=expected_output,
- )
- return metric.measure(test_case) >= minimum_score
diff --git a/deepeval/metrics/factual_consistency.py b/deepeval/metrics/factual_consistency.py
index e64f0a444..a4a6ef6e5 100644
--- a/deepeval/metrics/factual_consistency.py
+++ b/deepeval/metrics/factual_consistency.py
@@ -3,7 +3,6 @@
from deepeval.test_case import LLMTestCase
from deepeval.utils import chunk_text, softmax
from deepeval.metrics.base_metric import BaseMetric
-from deepeval.evaluator import assert_test
from deepeval.progress_context import progress_context
from sentence_transformers import CrossEncoder
@@ -67,27 +66,3 @@ def is_successful(self) -> bool:
@property
def __name__(self):
return "Factual Consistency"
-
-
-def is_factually_consistent(
- output: str, context: str, minimum_score: float = 0.3
-) -> bool:
- """Check if the output is factually consistent with the context."""
-
- metric = FactualConsistencyMetric(minimum_score=minimum_score)
- test_case = LLMTestCase(
- input="placeholder", actual_output=output, context=context
- )
- return metric.measure(test_case) >= minimum_score
-
-
-def assert_factual_consistency(
- output: str, context: list[str], minimum_score: float = 0.3
-):
- """Assert that the output is factually consistent with the context."""
-
- metric = FactualConsistencyMetric(minimum_score=minimum_score)
- test_case = LLMTestCase(
- input="placeholder", actual_output=output, context=context
- )
- assert_test(test_case, [metric])
diff --git a/deepeval/metrics/judgemental_gpt.py b/deepeval/metrics/judgemental_gpt.py
new file mode 100644
index 000000000..b735993be
--- /dev/null
+++ b/deepeval/metrics/judgemental_gpt.py
@@ -0,0 +1,71 @@
+from deepeval.metrics.base_metric import BaseMetric
+from deepeval.test_case import LLMTestCaseParams, LLMTestCase
+from typing import List
+from pydantic import BaseModel
+from deepeval.api import Api
+
+
+class JudgementalGPTResponse(BaseModel):
+ score: float
+ reason: str
+
+
+class JudgementalGPTRequest(BaseModel):
+ text: str
+ criteria: str
+
+
+class JudgementalGPT(BaseMetric):
+ def __init__(
+ self,
+ name: str,
+ criteria: str,
+ evaluation_params: List[LLMTestCaseParams],
+ minimum_score: float = 0.5,
+ ):
+ self.criteria = criteria
+ self.name = name
+ self.evaluation_params = evaluation_params
+ self.minimum_score = minimum_score
+ self.success = None
+ self.reason = None
+
+ @property
+ def __name__(self):
+ return self.name
+
+ def measure(self, test_case: LLMTestCase):
+ text = """"""
+ for param in self.evaluation_params:
+ value = getattr(test_case, param.value)
+ text += f"{param.value}: {value} \n\n"
+
+ judgemental_gpt_request_data = JudgementalGPTRequest(
+ text=text, criteria=self.criteria
+ )
+
+ try:
+ body = judgemental_gpt_request_data.model_dump(
+ by_alias=True, exclude_none=True
+ )
+ except AttributeError:
+ body = judgemental_gpt_request_data.dict(
+ by_alias=True, exclude_none=True
+ )
+ api = Api()
+ result = api.post_request(
+ endpoint="/v1/judgemental-gpt",
+ body=body,
+ )
+ response = JudgementalGPTResponse(
+ score=result["score"],
+ reason=result["reason"],
+ )
+ self.reason = response.reason
+ self.score = response.score / 10
+ self.success = self.score >= self.minimum_score
+
+ return self.score
+
+ def is_successful(self):
+ return self.success
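A minimal usage sketch for the new `JudgementalGPT` metric; the name and criteria strings are illustrative, `LLMTestCaseParams.ACTUAL_OUTPUT` is assumed to be a member of that enum, and the call requires access to the hosted `/v1/judgemental-gpt` endpoint via `Api()`:

from deepeval.metrics import JudgementalGPT
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

coherence = JudgementalGPT(
    name="Coherence",  # illustrative
    criteria="Determine whether the actual output is coherent and well structured.",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
    minimum_score=0.5,
)
test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra costs.",
)
coherence.measure(test_case)  # sets .score (normalized to 0-1) and .reason
print(coherence.score, coherence.reason)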
diff --git a/deepeval/metrics/llm_eval_metric.py b/deepeval/metrics/llm_eval_metric.py
index db00cf30e..b7c8a0d73 100644
--- a/deepeval/metrics/llm_eval_metric.py
+++ b/deepeval/metrics/llm_eval_metric.py
@@ -8,6 +8,8 @@
from deepeval.chat_completion.retry import call_openai_with_retry
from pydantic import BaseModel
import openai
+from langchain.chat_models import ChatOpenAI
+from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
class LLMEvalMetricResponse(BaseModel):
@@ -20,15 +22,20 @@ def __init__(
name: str,
criteria: str,
evaluation_params: List[LLMTestCaseParams],
- model: Optional[str] = "gpt-4",
+ evaluation_steps: str = "",
+ model: Optional[str] = "gpt-4-1106-preview",
minimum_score: float = 0.5,
+ **kwargs,
):
self.criteria = criteria
self.name = name
self.model = model
- self.evaluation_steps = ""
+ self.evaluation_steps = evaluation_steps
self.evaluation_params = evaluation_params
self.minimum_score = minimum_score
+ self.deployment_id = None
+ if "deployment_id" in kwargs:
+ self.deployment_id = kwargs["deployment_id"]
@property
def __name__(self):
@@ -51,10 +58,9 @@ def measure(self, test_case: LLMTestCase):
self.evaluation_steps = self.generate_evaluation_steps()
score = self.evaluate(test_case)
-        score = float(score) * 2 / 10
-
-        self.success = score >= self.minimum_score
-        return score
+        self.score = float(score) * 2 / 10
+        self.success = self.score >= self.minimum_score
+        return self.score
def is_successful(self):
return self.success
@@ -62,20 +68,16 @@ def is_successful(self):
def generate_evaluation_steps(self):
prompt: dict = evaluation_steps_template.format(criteria=self.criteria)
- res = call_openai_with_retry(
- lambda: openai.ChatCompletion.create(
- model=self.model,
- messages=[
- {
- "role": "system",
- "content": "You are a helpful assistant.",
- },
- {"role": "user", "content": prompt},
- ],
- )
+ model_kwargs = {}
+ if self.deployment_id is not None:
+ model_kwargs["deployment_id"] = self.deployment_id
+
+ chat_completion = ChatOpenAI(
+ model_name=self.model, model_kwargs=model_kwargs
)
- return res.choices[0].message.content
+ res = call_openai_with_retry(lambda: chat_completion.invoke(prompt))
+ return res.content
def evaluate(self, test_case: LLMTestCase):
text = """"""
@@ -89,24 +91,29 @@ def evaluate(self, test_case: LLMTestCase):
text=text,
)
+ model_kwargs = {
+ "top_p": 1,
+ "frequency_penalty": 0,
+ "stop": None,
+ "presence_penalty": 0,
+ }
+ if self.deployment_id is not None:
+ model_kwargs["deployment_id"] = self.deployment_id
+
+ chat_completion = ChatOpenAI(
+ model_name=self.model, max_tokens=5, n=20, model_kwargs=model_kwargs
+ )
+
res = call_openai_with_retry(
- lambda: openai.ChatCompletion.create(
- model=self.model,
- messages=[{"role": "system", "content": prompt}],
- max_tokens=5,
- top_p=1,
- frequency_penalty=0,
- presence_penalty=0,
- stop=None,
- # logprobs=5,
- n=20,
+ lambda: chat_completion.generate_prompt(
+ [chat_completion._convert_input(prompt)]
)
)
total_scores = 0
count = 0
- for content in res.choices:
+ for content in res.generations[0]:
try:
total_scores += float(content.message.content)
count += 1
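`LLMEvalMetric` now accepts preset `evaluation_steps`, defaults to `gpt-4-1106-preview`, and forwards an optional `deployment_id` (e.g. for Azure OpenAI) to LangChain's `ChatOpenAI` through `model_kwargs`. A minimal sketch with an illustrative name/criteria and a hypothetical deployment id:

from deepeval.metrics import LLMEvalMetric
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

summarization = LLMEvalMetric(
    name="Summarization",  # illustrative
    criteria="Judge whether the actual output faithfully answers the input.",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
    model="gpt-4-1106-preview",
    # deployment_id="my-azure-deployment",  # hypothetical, forwarded via model_kwargs
)
test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra costs.",
)
score = summarization.measure(test_case)  # also sets .score and .success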
diff --git a/deepeval/metrics/overall_score.py b/deepeval/metrics/overall_score.py
deleted file mode 100644
index a71d1da30..000000000
--- a/deepeval/metrics/overall_score.py
+++ /dev/null
@@ -1,76 +0,0 @@
-"""Overall Score
-"""
-
-from deepeval.test_case import LLMTestCase
-from deepeval.singleton import Singleton
-from deepeval.test_case import LLMTestCase
-from deepeval.metrics.answer_relevancy import AnswerRelevancyMetric
-from deepeval.metrics.conceptual_similarity import ConceptualSimilarityMetric
-from deepeval.metrics.factual_consistency import FactualConsistencyMetric
-from deepeval.metrics.base_metric import BaseMetric
-from deepeval.evaluator import assert_test
-
-
-class OverallScoreMetric(BaseMetric, metaclass=Singleton):
- def __init__(self, minimum_score: float = 0.5):
- self.minimum_score = minimum_score
- self.answer_relevancy = AnswerRelevancyMetric()
- self.factual_consistency_metric = FactualConsistencyMetric()
- self.conceptual_similarity_metric = ConceptualSimilarityMetric()
-
- def __call__(self, test_case: LLMTestCase):
- score = self.measure(test_case=test_case)
- self.success = score > self.minimum_score
- return score
-
- def measure(
- self,
- test_case: LLMTestCase,
- ) -> float:
- metadata = {}
- if test_case.context is not None:
- factual_consistency_score = self.factual_consistency_metric.measure(
- test_case
- )
- metadata["factual_consistency"] = float(factual_consistency_score)
-
- if test_case.input is not None:
- answer_relevancy_score = self.answer_relevancy.measure(test_case)
- metadata["answer_relevancy"] = float(answer_relevancy_score)
-
- if test_case.expected_output is not None:
- conceptual_similarity_score = (
- self.conceptual_similarity_metric.measure(test_case)
- )
- metadata["conceptual_similarity"] = float(
- conceptual_similarity_score
- )
-
- overall_score = sum(metadata.values()) / len(metadata)
-
- self.success = bool(overall_score > self.minimum_score)
- return overall_score
-
- def is_successful(self) -> bool:
- return self.success
-
- @property
- def __name__(self):
- return "Overall Score"
-
-
-def assert_overall_score(
- query: str,
- output: str,
- expected_output: str,
- context: str,
- minimum_score: float = 0.5,
-):
- metric = OverallScoreMetric(minimum_score=minimum_score)
- test_case = LLMTestCase(
- input=query,
- actual_output=output,
- expected_output=expected_output,
- context=context,
- )
- assert_test(test_case, metrics=[metric])
diff --git a/deepeval/metrics/ragas_metric.py b/deepeval/metrics/ragas_metric.py
index 5e53a1452..989011bbe 100644
--- a/deepeval/metrics/ragas_metric.py
+++ b/deepeval/metrics/ragas_metric.py
@@ -55,7 +55,7 @@ def measure(self, test_case: LLMTestCase):
context_relevancy_score = scores["context_relevancy"]
self.success = context_relevancy_score >= self.minimum_score
self.score = context_relevancy_score
- return context_relevancy_score
+ return self.score
def is_successful(self):
return self.success
@@ -108,7 +108,7 @@ def measure(self, test_case: LLMTestCase):
answer_relevancy_score = scores["answer_relevancy"]
self.success = answer_relevancy_score >= self.minimum_score
self.score = answer_relevancy_score
- return answer_relevancy_score
+ return self.score
def is_successful(self):
return self.success
@@ -159,7 +159,7 @@ def measure(self, test_case: LLMTestCase):
faithfulness_score = scores["faithfulness"]
self.success = faithfulness_score >= self.minimum_score
self.score = faithfulness_score
- return faithfulness_score
+ return self.score
def is_successful(self):
return self.success
@@ -212,7 +212,7 @@ def measure(self, test_case: LLMTestCase):
context_recall_score = scores["context_recall"]
self.success = context_recall_score >= self.minimum_score
self.score = context_recall_score
- return context_recall_score
+ return self.score
def is_successful(self):
return self.success
@@ -265,7 +265,7 @@ def measure(self, test_case: LLMTestCase):
harmfulness_score = scores["harmfulness"]
self.success = harmfulness_score >= self.minimum_score
self.score = harmfulness_score
- return harmfulness_score
+ return self.score
def is_successful(self):
return self.success
@@ -317,7 +317,7 @@ def measure(self, test_case: LLMTestCase):
coherence_score = scores["coherence"]
self.success = coherence_score >= self.minimum_score
self.score = coherence_score
- return coherence_score
+ return self.score
def is_successful(self):
return self.success
@@ -369,7 +369,7 @@ def measure(self, test_case: LLMTestCase):
maliciousness_score = scores["maliciousness"]
self.success = maliciousness_score >= self.minimum_score
self.score = maliciousness_score
- return maliciousness_score
+ return self.score
def is_successful(self):
return self.success
@@ -421,7 +421,7 @@ def measure(self, test_case: LLMTestCase):
correctness_score = scores["correctness"]
self.success = correctness_score >= self.minimum_score
self.score = correctness_score
- return correctness_score
+ return self.score
def is_successful(self):
return self.success
@@ -473,7 +473,7 @@ def measure(self, test_case: LLMTestCase):
conciseness_score = scores["conciseness"]
self.success = conciseness_score >= self.minimum_score
self.score = conciseness_score
- return conciseness_score
+ return self.score
def is_successful(self):
return self.success
@@ -540,7 +540,7 @@ def measure(self, test_case: LLMTestCase):
# 'answer_relevancy': 0.874}
self.success = ragas_score >= self.minimum_score
self.score = ragas_score
- return ragas_score
+ return self.score
def is_successful(self):
return self.success
@@ -548,16 +548,3 @@ def is_successful(self):
@property
def __name__(self):
return "RAGAS"
-
-
-def assert_ragas(
- test_case: LLMTestCase,
- metrics: List[str] = None,
- minimum_score: float = 0.3,
-):
- """Asserts if the Ragas score is above the minimum score"""
- metric = RagasMetric(metrics, minimum_score)
- score = metric.measure(test_case)
- assert (
- score >= metric.minimum_score
- ), f"Ragas score {score} is below the minimum score {metric.minimum_score}"
diff --git a/deepeval/metrics/ranking_similarity.py b/deepeval/metrics/ranking_similarity.py
deleted file mode 100644
index 426ba5112..000000000
--- a/deepeval/metrics/ranking_similarity.py
+++ /dev/null
@@ -1,205 +0,0 @@
-# Testing for ranking similarity
-from typing import Any, List, Optional, Union
-
-import numpy as np
-from tqdm import tqdm
-
-from ..test_case import LLMTestCase
-from .base_metric import BaseMetric
-from ..evaluator import assert_test
-
-
-class RBO:
- """
- This class will include some similarity measures between two different
- ranked lists.
- """
-
- def __init__(
- self,
- S: Union[List, np.ndarray],
- T: Union[List, np.ndarray],
- verbose: bool = False,
- ) -> None:
- """
- Initialize the object with the required lists.
- Examples of lists:
- S = ["a", "b", "c", "d", "e"]
- T = ["b", "a", 1, "d"]
-
- Both lists reflect the ranking of the items of interest, for example,
- list S tells us that item "a" is ranked first, "b" is ranked second,
- etc.
-
- Args:
- S, T (list or numpy array): lists with alphanumeric elements. They
- could be of different lengths. Both of the them should be
- ranked, i.e., each element"s position reflects its respective
- ranking in the list. Also we will require that there is no
- duplicate element in each list.
- verbose: If True, print out intermediate results. Default to False.
- """
-
- assert type(S) in [list, np.ndarray]
- assert type(T) in [list, np.ndarray]
-
- assert len(S) == len(set(S))
- assert len(T) == len(set(T))
-
- self.S, self.T = S, T
- self.N_S, self.N_T = len(S), len(T)
- self.verbose = verbose
- self.p = 0.5 # just a place holder
-
- def assert_p(self, p: float) -> None:
- """Make sure p is between (0, 1), if so, assign it to self.p.
-
- Args:
- p (float): The value p.
- """
- assert 0.0 < p < 1.0, "p must be between (0, 1)"
- self.p = p
-
- def _bound_range(self, value: float) -> float:
- """Bounds the value to [0.0, 1.0]."""
-
- try:
- assert 0 <= value <= 1 or np.isclose(1, value)
- return value
-
- except AssertionError:
- print("Value out of [0, 1] bound, will bound it.")
- larger_than_zero = max(0.0, value)
- less_than_one = min(1.0, larger_than_zero)
- return less_than_one
-
- def rbo(
- self,
- k: Optional[float] = None,
- p: float = 1.0,
- ext: bool = False,
- ) -> float:
- """
- This the weighted non-conjoint measures, namely, rank-biased overlap.
- Unlike Kendall tau which is correlation based, this is intersection
- based.
- The implementation if from Eq. (4) or Eq. (7) (for p != 1) from the
- RBO paper: http://www.williamwebber.com/research/papers/wmz10_tois.pdf
-
- If p = 1, it returns to the un-bounded set-intersection overlap,
- according to Fagin et al.
- https://researcher.watson.ibm.com/researcher/files/us-fagin/topk.pdf
-
- The fig. 5 in that RBO paper can be used as test case.
- Note there the choice of p is of great importance, since it
- essentially control the "top-weightness". Simply put, to an extreme,
- a small p value will only consider first few items, whereas a larger p
- value will consider more items. See Eq. (21) for quantitative measure.
-
- Args:
- k: The depth of evaluation.
- p: Weight of each agreement at depth d:
- p**(d-1). When set to 1.0, there is no weight, the rbo returns
- to average overlap.
- ext: If True, we will extrapolate the rbo, as in Eq. (23).
-
- Returns:
- The rbo at depth k (or extrapolated beyond).
- """
-
- if not self.N_S and not self.N_T:
- return 1 # both lists are empty
-
- if not self.N_S or not self.N_T:
- return 0 # one list empty, one non-empty
-
- if k is None:
- k = float("inf")
- k = min(self.N_S, self.N_T, k)
-
- # initialize the agreement and average overlap arrays
- A, AO = [0] * k, [0] * k
- if p == 1.0:
- weights = [1.0 for _ in range(k)]
- else:
- self.assert_p(p)
- weights = [1.0 * (1 - p) * p**d for d in range(k)]
-
- # using dict for O(1) look up
- S_running, T_running = {self.S[0]: True}, {self.T[0]: True}
- A[0] = 1 if self.S[0] == self.T[0] else 0
- AO[0] = weights[0] if self.S[0] == self.T[0] else 0
-
- for d in tqdm(range(1, k), disable=~self.verbose):
- tmp = 0
- # if the new item from S is in T already
- if self.S[d] in T_running:
- tmp += 1
- # if the new item from T is in S already
- if self.T[d] in S_running:
- tmp += 1
- # if the new items are the same, which also means the previous
- # two cases did not happen
- if self.S[d] == self.T[d]:
- tmp += 1
-
- # update the agreement array
- A[d] = 1.0 * ((A[d - 1] * d) + tmp) / (d + 1)
-
- # update the average overlap array
- if p == 1.0:
- AO[d] = ((AO[d - 1] * d) + A[d]) / (d + 1)
- else: # weighted average
- AO[d] = AO[d - 1] + weights[d] * A[d]
-
- # add the new item to the running set (dict)
- S_running[self.S[d]] = True
- T_running[self.T[d]] = True
-
- if ext and p < 1:
- return self._bound_range(AO[-1] + A[-1] * p**k)
-
- return self._bound_range(AO[-1])
-
-
-class RankingSimilarity(BaseMetric):
- def __init__(self, minimum_score: float = 0.1):
- self.minimum_score = minimum_score
-
- def __call__(self, test_case: LLMTestCase):
- score = self.measure(test_case.retrieval_context, test_case.context)
- return score
-
- def measure(self, test_case: LLMTestCase):
- list_1 = [str(x) for x in test_case.retrieval_context]
- list_2 = [str(x) for x in test_case.context]
- scorer = RBO(list_1, list_2)
- result = scorer.rbo(p=0.9, ext=True)
- self.success = result > self.minimum_score
- return result
-
- def is_successful(self):
- return self.success
-
- @property
- def __name__(self):
- return "Ranking Similarity"
-
-
-def assert_ranking_similarity(
- input: str,
- actual_output: str,
- context: List[str],
- retrieval_context: List[str],
- minimum_score,
- expected_output: str = "-",
-):
- scorer = RankingSimilarity(minimum_score=minimum_score)
- test_case = LLMTestCase(
- input=input,
- actual_output=actual_output,
- expected_output=expected_output,
- context=context,
- retrieval_context=retrieval_context,
- )
- assert_test(test_case, [scorer])
diff --git a/deepeval/metrics/scoring.py b/deepeval/metrics/scoring.py
index 4278c7e25..1ea499cfa 100644
--- a/deepeval/metrics/scoring.py
+++ b/deepeval/metrics/scoring.py
@@ -7,6 +7,7 @@
from deepeval.metrics._summac_model import SummaCZS
+# TODO: More scores are to be added
class Scorer:
"""This class calculates various Natural Language Processing (NLP) evaluation score.
@@ -14,8 +15,6 @@ class Scorer:
Which also uses an external model (BERTScore) in the scoring logic.
"""
- # Todo: More metrics are to be added
-
@classmethod
def rouge_score(
cls, target: str, prediction: str, score_type: str
@@ -208,7 +207,54 @@ def PII_score(
raise NotImplementedError()
@classmethod
- def toxic_score(
- cls, target: str, prediction: str, model: Optional[Any] = None
- ) -> float:
- raise NotImplementedError()
+ def neural_toxic_score(
+ cls, prediction: str, model: Optional[Any] = None
+ ) -> Union[float, dict]:
+ """
+ Calculate the toxicity score of a given text prediction using the Detoxify model.
+
+ Args:
+ prediction (str): The text prediction to evaluate for toxicity.
+ model (Optional[str], optional): The variant of the Detoxify model to use.
+ Available variants: 'original', 'unbiased', 'multilingual'.
+ If not provided, the 'original' variant is used by default.
+
+ Returns:
+ Union[float, dict]: The mean toxicity score, ranging from 0 (non-toxic) to 1 (highly toxic),
+ together with a dictionary of per-category toxicity scores.
+
+ For each model variant, we get a mean toxicity score and a dictionary of per-category toxicity scores.
+ Examples:
+ If model is 'original', we get a dict with the following keys:
+ - 'toxicity'
+ - 'severe_toxicity'
+ - 'obscene'
+ - 'threat'
+ - 'insult'
+ - 'identity_attack'
+
+ If model is 'unbiased', we get a dict with the same keys as 'original',
+ plus `sexual_explicit`.
+
+ If the model is 'multilingual', we get the same keys as the 'unbiased' variant.
+ """
+ try:
+ from detoxify import Detoxify
+ except ImportError as e:
+ raise ImportError("The 'detoxify' library is missing. Please install it using pip: pip install detoxify") from e
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ if model is not None:
+ assert model in [
+ "original",
+ "unbiased",
+ "multilingual",
+ ], "Invalid model. Available variants: original, unbiased, multilingual"
+ detoxify_model = Detoxify(model, device=device)
+ else:
+ detoxify_model = Detoxify("original", device=device)
+ toxicity_score_dict = detoxify_model.predict(prediction)
+ mean_toxicity_score = sum(list(toxicity_score_dict.values())) / len(
+ toxicity_score_dict
+ )
+ return mean_toxicity_score, toxicity_score_dict
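For orientation, a minimal usage sketch of the new `Scorer.neural_toxic_score` classmethod added above; it assumes the optional `detoxify` dependency (and a working `torch` install) is available and simply unpacks the tuple the method returns:

```python
# Hedged sketch: calling the new Scorer.neural_toxic_score classmethod.
# Assumes `pip install detoxify` (and torch) has been done.
from deepeval.metrics.scoring import Scorer

mean_score, per_category = Scorer.neural_toxic_score(
    prediction="You are a wonderful person.",
    model="original",  # or "unbiased" / "multilingual"
)
print(f"mean toxicity: {mean_score:.4f}")
print(per_category)  # dict with keys like 'toxicity', 'severe_toxicity', ...
```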
diff --git a/deepeval/metrics/toxic_classifier.py b/deepeval/metrics/toxic_classifier.py
index 71fd5a8e1..48a8eab61 100644
--- a/deepeval/metrics/toxic_classifier.py
+++ b/deepeval/metrics/toxic_classifier.py
@@ -7,7 +7,6 @@
from deepeval.singleton import Singleton
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics.base_metric import BaseMetric
-from deepeval.evaluator import assert_test
class DetoxifyModel(metaclass=Singleton):
@@ -74,7 +73,8 @@ def measure(self, test_case: LLMTestCase):
# Check if the average score meets the minimum requirement
self.success = average_score >= self.minimum_score
- return average_score
+ self.score = average_score
+ return self.score
def is_successful(self):
return self.success
@@ -82,16 +82,3 @@ def is_successful(self):
@property
def __name__(self):
return "Toxicity"
-
-
-def assert_non_toxic(
- evaluation_params: List[LLMTestCaseParams],
- input: str,
- actual_output: str,
- minimum_score: float = 0.5,
-):
- metric = NonToxicMetric(
- evaluation_params=evaluation_params, minimum_score=minimum_score
- )
- test_case = LLMTestCase(input=input, actual_output=actual_output)
- assert_test(test_case, [metric])
diff --git a/deepeval/old-dataset.py b/deepeval/old-dataset.py
new file mode 100644
index 000000000..4433fe27b
--- /dev/null
+++ b/deepeval/old-dataset.py
@@ -0,0 +1,593 @@
+"""Class for Evaluation Datasets
+"""
+import json
+import random
+import time
+from collections import UserList
+from datetime import datetime
+from typing import Any, Callable, List, Optional
+
+from tabulate import tabulate
+
+from deepeval.evaluator import run_test
+from deepeval.metrics.base_metric import BaseMetric
+from deepeval.test_case import LLMTestCase
+from dataclasses import asdict
+
+
+class EvaluationDataset(UserList):
+ """Class for Evaluation Datasets - which are a list of test cases"""
+
+ def __init__(self, test_cases: List[LLMTestCase]):
+ self.data: List[LLMTestCase] = test_cases
+
+ @classmethod
+ def from_csv(
+ cls, # Use 'cls' instead of 'self' for class methods
+ csv_filename: str,
+ query_column: Optional[str] = None,
+ expected_output_column: Optional[str] = None,
+ context_column: Optional[str] = None,
+ output_column: Optional[str] = None,
+ id_column: str = None,
+ metrics: List[BaseMetric] = None,
+ ):
+ import pandas as pd
+
+ df = pd.read_csv(csv_filename)
+ if query_column is not None and query_column in df.columns:
+ querys = df[query_column].values
+ else:
+ querys = [None] * len(df)
+ if (
+ expected_output_column is not None
+ and expected_output_column in df.columns
+ ):
+ expected_outputs = df[expected_output_column].values
+ else:
+ expected_outputs = [None] * len(df)
+ if context_column is not None and context_column in df.columns:
+ contexts = df[context_column].values
+ else:
+ contexts = [None] * len(df)
+ if output_column is not None and output_column in df.columns:
+ outputs = df[output_column].values
+ else:
+ outputs = [None] * len(df)
+ if id_column is not None:
+ ids = df[id_column].values
+ else:
+ ids = [None] * len(df)
+
+ # Collect the test cases in a local list rather than mutating a class attribute
+ test_cases = []
+
+ for i, query_data in enumerate(querys):
+ test_cases.append(
+ LLMTestCase(
+ input=query_data,
+ expected_output=expected_outputs[i],
+ context=contexts[i],
+ id=ids[i] if id_column else None,
+ actual_output=outputs[i] if output_column else None,
+ )
+ )
+ return cls(test_cases)
+
+ def from_test_cases(self, test_cases: list):
+ self.data = test_cases
+
+ @classmethod
+ def from_hf_dataset(
+ cls,
+ dataset_name: str,
+ split: str,
+ query_column: str,
+ expected_output_column: str,
+ context_column: str = None,
+ output_column: str = None,
+ id_column: str = None,
+ ):
+ """
+ Load test cases from a HuggingFace dataset.
+
+ Args:
+ dataset_name (str): The name of the HuggingFace dataset to load.
+ split (str): The split of the dataset to load (e.g., 'train', 'test').
+ query_column (str): The column in the dataset corresponding to the query.
+ expected_output_column (str): The column in the dataset corresponding to the expected output.
+ context_column (str, optional): The column in the dataset corresponding to the context. Defaults to None.
+ output_column (str, optional): The column in the dataset corresponding to the output. Defaults to None.
+ id_column (str, optional): The column in the dataset corresponding to the ID. Defaults to None.
+
+ Returns:
+ EvaluationDataset: An instance of EvaluationDataset containing the loaded test cases.
+ """
+ try:
+ from datasets import load_dataset
+ except ImportError:
+ raise ImportError(
+ "The 'datasets' library is missing. Please install it using pip: pip install datasets"
+ )
+
+ hf_dataset = load_dataset(dataset_name, split=split)
+ test_cases = []
+
+ for i, row in enumerate(hf_dataset):
+ test_cases.append(
+ LLMTestCase(
+ input=row[query_column],
+ expected_output=row[expected_output_column],
+ context=row[context_column] if context_column else None,
+ actual_output=row[output_column] if output_column else None,
+ id=row[id_column] if id_column else None,
+ )
+ )
+ return cls(test_cases)
+
+ @classmethod
+ def from_json(
+ cls,
+ json_filename: str,
+ query_column: str,
+ expected_output_column: str,
+ context_column: str,
+ output_column: str,
+ id_column: str = None,
+ ):
+ """
+ This is for JSON data in the format of key-value array pairs.
+ {
+ "query": ["What is the customer success number", "What is the customer success number"],
+ "context": ["Context 1", "Context 2"],
+ "output": ["Output 1", "Output 2"]
+ }
+
+ if the JSON data is in a list of dictionaries, use from_json_list
+ """
+ with open(json_filename, "r") as f:
+ data = json.load(f)
+ test_cases = []
+
+ for i, query in enumerate(data[query_column]):
+ test_cases.append(
+ LLMTestCase(
+ input=data[query_column][i],
+ expected_output=data[expected_output_column][i],
+ context=data[context_column][i],
+ actual_output=data[output_column][i],
+ id=data[id_column][i] if id_column else None,
+ )
+ )
+ return cls(test_cases)
+
+ @classmethod
+ def from_json_list(
+ cls,
+ json_filename: str,
+ query_column: str,
+ expected_output_column: str,
+ context_column: str,
+ output_column: str,
+ id_column: str = None,
+ ):
+ """
+ This is for JSON data in the format of a list of dictionaries.
+ [
+ {"query": "What is the customer success number", "expected_output": "What is the customer success number", "context": "Context 1", "output": "Output 1"},
+ ]
+ """
+ with open(json_filename, "r") as f:
+ data = json.load(f)
+ test_cases = []
+ for i, query in enumerate(data):
+ test_cases.append(
+ LLMTestCase(
+ input=data[i][query_column],
+ expected_output=data[i][expected_output_column],
+ context=data[i][context_column],
+ actual_output=data[i][output_column],
+ id=data[i][id_column] if id_column else None,
+ )
+ )
+ return cls(test_cases)
+
+ @classmethod
+ def from_dict(
+ cls,
+ data: List[dict],
+ query_key: str,
+ expected_output_key: str,
+ context_key: str = None,
+ output_key: str = None,
+ id_key: str = None,
+ ):
+ """
+ Load test cases from a list of dictionaries.
+
+ Args:
+ data (List[dict]): The list of dictionaries containing the test case data.
+ query_key (str): The key in each dictionary corresponding to the query.
+ expected_output_key (str): The key in each dictionary corresponding to the expected output.
+ context_key (str, optional): The key in each dictionary corresponding to the context. Defaults to None.
+ output_key (str, optional): The key in each dictionary corresponding to the output. Defaults to None.
+ id_key (str, optional): The key in each dictionary corresponding to the ID. Defaults to None.
+ metrics (List[BaseMetric], optional): The list of metrics to be associated with the test cases. Defaults to None.
+
+ Returns:
+ EvaluationDataset: An instance of EvaluationDataset containing the loaded test cases.
+ """
+ test_cases = []
+ for i, case_data in enumerate(data):
+ test_cases.append(
+ LLMTestCase(
+ input=case_data[query_key],
+ expected_output=case_data[expected_output_key],
+ context=case_data[context_key] if context_key else None,
+ actual_output=case_data[output_key] if output_key else None,
+ id=case_data[id_key] if id_key else None,
+ )
+ )
+ return cls(test_cases)
+
+ def to_dict(self):
+ return [asdict(x) for x in self.data]
+
+ def to_csv(self, csv_filename: str):
+ import pandas as pd
+
+ df = pd.DataFrame(self.data)
+ df.to_csv(csv_filename, index=False)
+
+ def to_json(self, json_filename: str):
+ with open(json_filename, "w") as f:
+ # LLMTestCase dataclasses are not directly JSON-serializable
+ json.dump(self.to_dict(), f)
+
+ def from_hf_evals(self):
+ raise NotImplementedError
+
+ def from_df(self):
+ raise NotImplementedError
+
+ def __repr__(self):
+ return f"{self.__class__.__name__}({self.data})"
+
+ def sample(self, n: int = 5):
+ if len(self.data) <= n:
+ n = len(self.data)
+ result = random.sample(self.data, n)
+ return [asdict(r) for r in result]
+
+ def head(self, n: int = 5):
+ return self.data[:n]
+
+ def __getitem__(self, index):
+ return self.data[index]
+
+ def __setitem__(self, index, value):
+ self.data[index] = value
+
+ def __delitem__(self, index):
+ del self.data[index]
+
+ def run_evaluation(
+ self,
+ completion_fn: Callable[[str], str] = None,
+ outputs: List[str] = None,
+ test_filename: str = None,
+ max_retries: int = 3,
+ min_success: int = 1,
+ metrics: List[BaseMetric] = None,
+ ) -> str:
+ """Run evaluation with given metrics"""
+ if completion_fn is None:
+ assert outputs is not None
+
+ table: List[List[Any]] = []
+
+ headers: List[str] = [
+ "Test Passed",
+ "Metric Name",
+ "Score",
+ "Output",
+ "Expected output",
+ "Message",
+ ]
+ results = run_test(
+ test_cases=self.data,
+ metrics=metrics,
+ raise_error=True,
+ max_retries=max_retries,
+ min_success=min_success,
+ )
+ for result in results:
+ table.append(
+ [
+ result.success,
+ result.metric_name,
+ result.score,
+ result.output,
+ result.expected_output,
+ "",
+ ]
+ )
+ if test_filename is None:
+ test_filename = (
+ f"test-result-{datetime.now().__str__().replace(' ', '-')}.txt"
+ )
+ with open(test_filename, "w") as f:
+ f.write(tabulate(table, headers=headers))
+ print(f"Saved to {test_filename}")
+ for t in table:
+ assert t[0] == True, t[-1]
+ return test_filename
+
+ def review(self):
+ """A bulk editor for reviewing synthetic data."""
+ try:
+ from dash import (
+ Dash,
+ Input,
+ Output,
+ State,
+ callback,
+ dash_table,
+ dcc,
+ html,
+ )
+ except ModuleNotFoundError:
+ raise Exception(
+ """You will need to run `pip install dash` to be able to review tests that were automatically created."""
+ )
+
+ table_data = [
+ {"input": x.query, "expected_output": x.expected_output}
+ for x in self.data
+ ]
+ app = Dash(
+ __name__,
+ external_stylesheets=[
+ "https://cdn.jsdelivr.net/npm/bootswatch@5.3.1/dist/darkly/bootstrap.min.css"
+ ],
+ )
+
+ app.layout = html.Div(
+ [
+ html.H1("Bulk Review Test Cases", style={"marginLeft": "20px"}),
+ html.Button(
+ "Add Test case",
+ id="editing-rows-button",
+ n_clicks=0,
+ style={
+ "padding": "8px",
+ "backgroundColor": "purple", # Added purple background color
+ "color": "white",
+ "border": "2px solid purple", # Added purple border
+ "marginLeft": "20px",
+ },
+ ),
+ html.Div(
+ dash_table.DataTable(
+ id="adding-rows-table",
+ columns=[
+ {
+ "name": c.title().replace("_", " "),
+ "id": c,
+ "deletable": True,
+ "renamable": True,
+ }
+ for i, c in enumerate(["input", "expected_output"])
+ ],
+ data=table_data,
+ editable=True,
+ row_deletable=True,
+ style_data_conditional=[
+ {
+ "if": {"row_index": "odd"},
+ "backgroundColor": "rgb(40, 40, 40)",
+ "color": "white",
+ },
+ {
+ "if": {"row_index": "even"},
+ "backgroundColor": "rgb(30, 30, 30)",
+ "color": "white",
+ },
+ {
+ "if": {"state": "selected"},
+ "backgroundColor": "white",
+ "color": "white",
+ },
+ ],
+ style_header={
+ "backgroundColor": "rgb(30, 30, 30)",
+ "color": "white",
+ "fontWeight": "bold",
+ "padding": "10px", # Added padding
+ },
+ style_cell={
+ "padding": "10px", # Added padding
+ "whiteSpace": "pre-wrap", # Wrap cell contents
+ "maxHeight": "200px",
+ },
+ ),
+ style={"padding": "20px"}, # Added padding
+ ),
+ html.Div(style={"margin-top": "20px"}),
+ html.Button(
+ "Save To CSV",
+ id="save-button",
+ n_clicks=0,
+ style={
+ "padding": "8px",
+ "backgroundColor": "purple", # Added purple background color
+ "color": "white",
+ "border": "2px solid purple", # Added purple border
+ "marginLeft": "20px",
+ },
+ ),
+ dcc.Input(
+ id="filename-input",
+ type="text",
+ placeholder="Enter filename (.csv format)",
+ style={
+ "padding": "8px",
+ "backgroundColor": "rgb(30, 30, 30)",
+ "color": "white",
+ "marginLeft": "20px",
+ "border": "2px solid purple", # Added purple border
+ "width": "200px", # Edited width
+ },
+ value="review-test.csv",
+ ),
+ html.Div(id="code-output"),
+ ],
+ style={"padding": "20px"}, # Added padding
+ )
+
+ @callback(
+ Output("adding-rows-table", "data"),
+ Input("editing-rows-button", "n_clicks"),
+ State("adding-rows-table", "data"),
+ State("adding-rows-table", "columns"),
+ )
+ def add_row(n_clicks, rows, columns):
+ if n_clicks > 0:
+ rows.append({c["id"]: "" for c in columns})
+ return rows
+
+ @callback(
+ Output("save-button", "n_clicks"),
+ Input("save-button", "n_clicks"),
+ State("adding-rows-table", "data"),
+ State("adding-rows-table", "columns"),
+ State("filename-input", "value"),
+ )
+ def save_data(n_clicks, rows, columns, filename):
+ if n_clicks > 0:
+ import csv
+
+ with open(filename, "w", newline="") as f:
+ writer = csv.DictWriter(
+ f, fieldnames=[c["id"] for c in columns]
+ )
+ writer.writeheader()
+ writer.writerows(rows)
+ return n_clicks
+
+ @app.callback(
+ Output("code-output", "children"),
+ Input("save-button", "n_clicks"),
+ State("filename-input", "value"),
+ )
+ def show_code(n_clicks, filename):
+ if n_clicks > 0:
+ code = f"""
+ from deepeval.dataset import EvaluationDataset
+
+ # Replace 'filename.csv' with the actual filename
+ ds = EvaluationDataset.from_csv('{filename}')
+
+ # Access the data in the CSV file
+ # For example, you can print a few rows
+ print(ds.sample())
+ """
+ return html.Div(
+ [
+ html.P(
+ "Code to load the CSV file back into a dataset for testing:"
+ ),
+ html.Pre(code, className="language-python"),
+ ],
+ style={"padding": "20px"}, # Added padding
+ )
+ else:
+ return ""
+
+ app.run(debug=False)
+
+ def add_evaluation_query_answer_pairs(
+ self,
+ openai_api_key: str,
+ context: str,
+ n: int = 3,
+ model: str = "openai/gpt-3.5-turbo",
+ ):
+ """Utility function to create an evaluation dataset using ChatGPT."""
+ new_dataset = create_evaluation_query_answer_pairs(
+ openai_api_key=openai_api_key, context=context, n=n, model=model
+ )
+ self.data += new_dataset.data
+ print(f"Added {len(new_dataset.data)}!")
+
+
+def make_chat_completion_request(prompt: str, openai_api_key: str):
+ import openai
+
+ openai.api_key = openai_api_key
+ response = openai.ChatCompletion.create(
+ model="gpt-3.5-turbo",
+ messages=[
+ {"role": "system", "content": "You are a helpful assistant."},
+ {"role": "user", "content": prompt},
+ ],
+ )
+ return response.choices[0].message.content
+
+
+def generate_chatgpt_output(prompt: str, openai_api_key: str) -> str:
+ max_retries = 3
+ retry_delay = 1
+ for attempt in range(max_retries):
+ try:
+ expected_output = make_chat_completion_request(
+ prompt=prompt, openai_api_key=openai_api_key
+ )
+ break
+ except Exception as e:
+ print(f"Error occurred: {e}")
+ if attempt < max_retries - 1:
+ print(f"Retrying in {retry_delay} seconds...")
+ time.sleep(retry_delay)
+ retry_delay *= 2
+ else:
+ raise
+
+ return expected_output
+
+
+def create_evaluation_query_answer_pairs(
+ openai_api_key: str,
+ context: str,
+ n: int = 3,
+ model: str = "openai/gpt-3.5-turbo",
+) -> EvaluationDataset:
+ """Utility function to create an evaluation dataset using GPT."""
+ prompt = f"""You are generating {n} sets of of query-answer pairs to create an evaluation dataset based on the below context.
+Context: {context}
+
+Respond in JSON format on a single line without whitespace, as an array of JSON objects with the keys `query` and `answer`. Do not use any other keys in the response.
+JSON:"""
+ for attempt in range(3):
+ try:
+ responses = generate_chatgpt_output(
+ prompt, openai_api_key=openai_api_key
+ )
+ responses = json.loads(responses)
+ break
+ except Exception as e:
+ print(e)
+ if attempt == 2:
+ # Give up after three attempts and return an empty dataset
+ return EvaluationDataset(test_cases=[])
+
+ test_cases = []
+ for response in responses:
+ test_case = LLMTestCase(
+ input=response["query"],
+ expected_output=response["answer"],
+ context=context,
+ # store this as None for now
+ actual_output="-",
+ )
+ test_cases.append(test_case)
+
+ dataset = EvaluationDataset(test_cases=test_cases)
+ return dataset
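As orientation for the re-added `old-dataset.py` above, a minimal sketch of how its `from_dict` constructor could be used; since `old-dataset.py` is not an importable module name, the snippet assumes `EvaluationDataset` (as defined above) is already in scope, and the record keys are purely illustrative:

```python
# Hedged sketch: building the EvaluationDataset defined above from a list
# of dicts via from_dict. Assumes the class is in scope; keys are illustrative.
records = [
    {
        "query": "What are your operating hours?",
        "expected_output": "10 AM to 6 PM, Monday to Friday.",
        "output": "We are open 10 AM to 6 PM on weekdays.",
    },
]

ds = EvaluationDataset.from_dict(
    data=records,
    query_key="query",
    expected_output_key="expected_output",
    output_key="output",
)
print(ds.sample(n=1))  # list of dicts produced via dataclasses.asdict
```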
diff --git a/deepeval/plugins/plugin.py b/deepeval/plugins/plugin.py
index 485f7b555..5d034ebd5 100644
--- a/deepeval/plugins/plugin.py
+++ b/deepeval/plugins/plugin.py
@@ -3,18 +3,12 @@
from rich import print
from typing import Optional, Any
from deepeval.constants import PYTEST_RUN_TEST_NAME
-from deepeval.test_run import TestRun, test_run_manager
+from deepeval.test_run import test_run_manager
def pytest_sessionstart(session: pytest.Session):
- test_run = TestRun(
- testFile=session.config.getoption("file_or_dir")[0],
- testCases=[],
- metricScores=[],
- configurations={},
- )
- test_run_manager.set_test_run(test_run)
- test_run_manager.save_test_run()
+ test_run_manager.save_to_disk = True
+ test_run_manager.create_test_run(session.config.getoption("file_or_dir")[0])
@pytest.hookimpl(tryfirst=True)
diff --git a/deepeval/templates.py b/deepeval/templates.py
index 0227cf30a..b3c387ea4 100644
--- a/deepeval/templates.py
+++ b/deepeval/templates.py
@@ -14,5 +14,9 @@
Given the evaluation steps, please evaluate the provided Text. Some fields in text might be unavailable and will be labelled "N/A". Return a `score` ranging from 0 - 5, with 5 being that it follows the criteria and 0 being that it does not. Be extra harsh and give as low a score as possible as it designed to penalize.
+**
+IMPORTANT: Please make sure to only return an integer value between 0 - 5. No words or explanation is needed.
+**
+
score:
"""
diff --git a/deepeval/test_case.py b/deepeval/test_case.py
index aa2ff0dc9..fa107671a 100644
--- a/deepeval/test_case.py
+++ b/deepeval/test_case.py
@@ -12,7 +12,6 @@ class LLMTestCaseParams(Enum):
EXPECTED_OUTPUT = "expected_output"
CONTEXT = "context"
RETRIEVAL_CONTEXT = "retrieval_context"
- ID = "id"
@dataclass
diff --git a/deepeval/test_run.py b/deepeval/test_run.py
index 13719d69b..9bd532872 100644
--- a/deepeval/test_run.py
+++ b/deepeval/test_run.py
@@ -2,7 +2,7 @@
import json
from pydantic import BaseModel, Field
from typing import Any, Optional, List, Dict
-from deepeval.metrics.base_metric import BaseMetric
+from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase
from collections import defaultdict
from deepeval.tracing import get_trace_stack
@@ -26,6 +26,7 @@ class MetricsMetadata(BaseModel):
metric: str
score: float
minimum_score: float = Field(None, alias="minimumScore")
+ reason: Optional[str] = None
class MetricScore(BaseModel):
@@ -62,15 +63,19 @@ def get_average_metric_score(self):
class MetricsMetadataAverageDict:
def __init__(self):
- self.metric_dict = defaultdict(list)
+ self.metric_scores_dict = defaultdict(list)
self.min_score_dict = defaultdict(float)
+ self.metric_reason_dict = defaultdict(str)
def add_metric(self, metric: BaseMetric):
- self.metric_dict[metric.__name__].append(metric.score)
- self.min_score_dict[metric.__name__] = min(
- self.min_score_dict.get(metric.__name__, float("inf")),
+ metric_name = metric.__name__
+
+ self.metric_scores_dict[metric_name].append(metric.score)
+ self.min_score_dict[metric_name] = min(
+ self.min_score_dict.get(metric_name, float("inf")),
metric.minimum_score,
)
+ self.metric_reason_dict[metric_name] = metric.reason
def get_metrics_metadata(self):
return [
@@ -78,8 +83,9 @@ def get_metrics_metadata(self):
metric=metric_name,
score=sum(scores) / len(scores),
minimumScore=self.min_score_dict[metric_name],
+ reason=self.metric_reason_dict[metric_name],
)
- for metric_name, scores in self.metric_dict.items()
+ for metric_name, scores in self.metric_scores_dict.items()
]
@@ -99,8 +105,7 @@ class APITestCase(BaseModel):
class TestRun(BaseModel):
test_file: Optional[str] = Field(
- # TODO: Fix test_file
- "test.py",
+ None,
alias="testFile",
)
dict_test_cases: Dict[int, APITestCase] = Field(
@@ -120,6 +125,7 @@ def add_llm_test_case(
test_case: LLMTestCase,
metrics: List[BaseMetric],
run_duration: float,
+ index: int,
):
# Check if test case with the same ID already exists
test_case_id = id(test_case)
@@ -145,17 +151,16 @@ def add_llm_test_case(
else:
# If it doesn't exist, create a new test case
# Adding backwards compatibility to ensure context still works.
- context = test_case.context
success = all([metric.is_successful() for metric in metrics])
api_test_case: APITestCase = APITestCase(
- name=os.getenv(PYTEST_RUN_TEST_NAME, "-"),
+ name=os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{index}"),
input=test_case.input,
actualOutput=test_case.actual_output,
expectedOutput=test_case.expected_output,
success=success,
metricsMetadata=metrics_metadata,
runDuration=run_duration,
- context=context,
+ context=test_case.context,
traceStack=get_trace_stack(),
)
@@ -191,34 +196,52 @@ class TestRunManager:
def __init__(self):
self.test_run = None
self.temp_file_name = TEMP_FILE_NAME
+ self.save_to_disk = False
- def set_test_run(self, test_run: "TestRun"):
+ def set_test_run(self, test_run: TestRun):
self.test_run = test_run
+ def create_test_run(self, file_name: Optional[str] = None):
+ test_run = TestRun(
+ testFile=file_name,
+ testCases=[],
+ metricScores=[],
+ configurations={},
+ )
+ self.set_test_run(test_run)
+
+ if self.save_to_disk:
+ self.save_test_run()
+
def get_test_run(self):
- try:
- with portalocker.Lock(
- self.temp_file_name, mode="r", timeout=5
- ) as file:
- self.test_run = self.test_run.load(file)
- except (FileNotFoundError, portalocker.exceptions.LockException):
- print("Error loading test run from disk", file=sys.stderr)
- self.test_run = None
+ if self.test_run is None or not self.save_to_disk:
+ self.create_test_run()
+
+ if self.save_to_disk:
+ try:
+ with portalocker.Lock(
+ self.temp_file_name, mode="r", timeout=5
+ ) as file:
+ self.test_run = self.test_run.load(file)
+ except (FileNotFoundError, portalocker.exceptions.LockException):
+ print("Error loading test run from disk", file=sys.stderr)
+ self.test_run = None
return self.test_run
def save_test_run(self):
- try:
- with portalocker.Lock(
- self.temp_file_name, mode="w", timeout=5
- ) as file:
- self.test_run = self.test_run.save(file)
- except portalocker.exceptions.LockException:
- print("Error saving test run to disk", file=sys.stderr)
+ if self.save_to_disk:
+ try:
+ with portalocker.Lock(
+ self.temp_file_name, mode="w", timeout=5
+ ) as file:
+ self.test_run = self.test_run.save(file)
+ except portalocker.exceptions.LockException:
+ print("Error saving test run to disk", file=sys.stderr)
def clear_test_run(self):
self.test_run = None
- def display_test_run(self, test_run: TestRun):
+ def display_results_table(self, test_run: TestRun):
# Calculate the average of each metric
metrics_avg = {
metric.metric: metric.score for metric in test_run.metric_scores
@@ -287,6 +310,8 @@ def display_test_run(self, test_run: TestRun):
def post_test_run(self, test_run: TestRun):
console = Console()
+
+ # TODO: change this, very hacky way to check if api key exists
if os.path.exists(".deepeval"):
try:
# make sure to exclude none for `context` to ensure it is handled properly
@@ -336,7 +361,7 @@ def save_test_run_locally(self):
print(f"Results saved in {local_folder} as {new_test_filename}")
os.remove(new_test_filename)
- def wrap_up_test_run(self):
+ def wrap_up_test_run(self, display_table: bool = True):
test_run = test_run_manager.get_test_run()
test_run.cleanup()
if test_run is None or len(test_run.test_cases) == 0:
@@ -344,7 +369,8 @@ def wrap_up_test_run(self):
delete_file_if_exists(test_run_manager.temp_file_name)
return
- self.display_test_run(test_run)
+ if display_table:
+ self.display_results_table(test_run)
self.post_test_run(test_run)
self.save_test_run_locally()
delete_file_if_exists(self.temp_file_name)
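The `test_run.py` changes above add a `reason` field to `MetricsMetadata` and read `metric.reason` when aggregating scores. A hedged sketch of a hypothetical custom metric that populates `reason` so it surfaces in the test run metadata (class name and logic are illustrative, not part of this patch):

```python
# Hedged sketch: a hypothetical metric that sets `reason`, which the updated
# MetricsMetadataAverageDict above copies into MetricsMetadata.reason.
from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase


class ExactMatchMetric(BaseMetric):
    def __init__(self, minimum_score: float = 1.0):
        self.minimum_score = minimum_score

    def measure(self, test_case: LLMTestCase):
        match = test_case.actual_output == test_case.expected_output
        self.score = 1.0 if match else 0.0
        self.success = self.score >= self.minimum_score
        self.reason = "exact match" if match else "outputs differ"
        return self.score

    def is_successful(self):
        return self.success

    @property
    def __name__(self):
        return "Exact Match"
```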
diff --git a/docs/docs/evaluation-datasets.mdx b/docs/docs/evaluation-datasets.mdx
index 66369cbdc..e8089c25d 100644
--- a/docs/docs/evaluation-datasets.mdx
+++ b/docs/docs/evaluation-datasets.mdx
@@ -6,24 +6,23 @@ sidebar_label: Datasets
## Quick Summary
-`deepeval` leverages decorators provided by Pytest to run test cases on entire evaluation datasets which are often prepared in the form of CSVs. However, your data does not have to strictly be in CSV format. To run test cases on your evaluation dataset, simply process your data structure into the data type of `Dataset`.
+In `deepeval`, an evaluation dataset, or simply a dataset, is a collection of `LLMTestCase`s. There are two approaches to evaluating datasets in `deepeval`:
-```python
-class DataPoint:
- input: str
- expected_output: Optional[str]
- context: Optional[List[str]]
+1. using `@pytest.mark.parametrize` and `assert_test`
+2. using `evaluate`
-class Dataset:
- data_points: List[DataPoint]
-```
+## Create An Evaluation Dataset
-For example, the following would consitute as a valid `Dataset`:
+Your original dataset can be in any format, such as CSV or JSON. Taking JSON as an example, the first step is to create a list of `LLMTestCase`s from your original dataset.
-```
-dataset = [
+```python
+from deepeval.test_case import LLMTestCase
+
+original_dataset = [
{
"input": "What are your operating hours?",
+ # Replace with your LLM application output
+ "actual_output": "..."
"context": [
"Our company operates from 10 AM to 6 PM, Monday to Friday.",
"We are closed on weekends and public holidays.",
@@ -32,37 +31,58 @@ dataset = [
},
{
"input": "Do you offer free shipping?",
+ # Replace with your LLM application output
+ "actual_output": "..."
"expected_output": "Yes, we offer free shipping on orders over $50."
},
{
"input": "What is your return policy?",
+ # Replace with your LLM application output
+ "actual_output": "..."
},
]
+
+dataset = []
+for datapoint in original_dataset:
+ input = datapoint.get("input", None)
+ actual_output = datapoint.get("actual_output", None)
+ expected_output = datapoint.get("expected_output", None)
+ context = datapoint.get("context", None)
+
+ test_case = LLMTestCase(
+ input=input,
+ actual_output=actual_output,
+ expected_output=expected_output,
+ context=context
+ )
+ dataset.append(test_case)
+```
+
+## Evaluate Your Dataset With Pytest
+
+Before we begin, we highly recommend [logging into Confident AI](https://app.confident-ai.com) to keep track of all evaluation results on the cloud:
+
+```console
+deepeval login
```
-Finally, utilize the `@pytest.mark.parametrize` to create test cases for each of the data points.
+`deepeval` utilizes the `@pytest.mark.parametrize` decorator to loop through entire datasets.
```python title="test_bulk.py"
+import pytest
+from deepeval.test_case import LLMTestCase
+from deepeval.metrics.factual_consistency import FactualConsistencyMetric
+from deepeval.metrics.answer_relevancy import AnswerRelevancyMetric
+from deepeval.evaluator import assert_test
+
+dataset = [...]
+
@pytest.mark.parametrize(
"test_case",
dataset,
)
-def test_customer_chatbot(test_case: dict):
- input = test_case.get("input")
- expected_output = test_case.get("expected_output")
- context = test_case.get("context")
-
- # Replace with your LLM application
- actual_output = "..."
-
+def test_customer_chatbot(test_case: LLMTestCase):
factual_consistency_metric = FactualConsistencyMetric(minimum_score=0.3)
answer_relevancy_metric = AnswerRelevancyMetric(minimum_score=0.5)
- test_case = LLMTestCase(
- input=input,
- actual_output=actual_output,
- expected_output=expected_output,
- context=context
- )
assert_test(test_case, [factual_consistency_metric, answer_relevancy_metric])
```
@@ -72,6 +92,18 @@ To run several tests cases at once in parallel, use the optional `-n` flag follo
deepeval test run test_bulk.py -n 3
```
-## Create An Evaluation Dataset
+## Evaluate Your Dataset Without Pytest
+
+Alternatively, you can use deepeval's `evaluate` function to evaluate datasets. This approach avoids the CLI, but does not allow for parallel test execution.
-_coming soon..._
+```python
+from deepeval.evaluator import evaluate
+from deepeval.metrics.factual_consistency import FactualConsistencyMetric
+from deepeval.metrics.answer_relevancy import AnswerRelevancyMetric
+
+dataset = [...]
+
+factual_consistency_metric = FactualConsistencyMetric(minimum_score=0.3)
+answer_relevancy_metric = AnswerRelevancyMetric(minimum_score=0.5)
+evaluate(dataset, [factual_consistency_metric, answer_relevancy_metric])
+```
diff --git a/docs/docs/evaluation-metrics.mdx b/docs/docs/evaluation-metrics.mdx
index b72c7cba0..70a5ab748 100644
--- a/docs/docs/evaluation-metrics.mdx
+++ b/docs/docs/evaluation-metrics.mdx
@@ -6,9 +6,7 @@ sidebar_label: Metrics
## Quick Summary
-In `deepeval`, a metric serves as a standard of measurement for evaluating the performance of an LLM output based on a specific criteria of interest. Essentially, while the metric acts as the ruler, the test case represents what you're assessing.
-
-`deepeval` offers a range of default metrics for you to quickly get started with, which includes:
+In `deepeval`, a metric serves as a standard of measurement for evaluating the performance of an LLM output based on a specific criteria of interest. Essentially, while the metric acts as the ruler, the test case represents what you're assessing. `deepeval` offers a range of default metrics for you to quickly get started with, which includes:
- Factual Consistency
- Answer Relevancy
@@ -17,9 +15,7 @@ In `deepeval`, a metric serves as a standard of measurement for evaluating the p
- Toxicity
- Bias
-`deepeval` also offers you a straightforward way to develop your own custom LLM-based evaluation metrics. **This is noteworthy because all default metrics in `deepeval` are derived from traditional NLP models, not LLMs.**
-
-Metrics are measured on a test case. As outlined in the [test cases section](evaluation-test-cases), you may be required to supply the optional `context` and `expected_output` arguments for your `LLMTestCase` depending on the metrics you're evaluating it on.
+`deepeval` also offers you a straightforward way to develop your own custom LLM-based evaluation metrics. This is noteworthy because all default metrics in `deepeval` are derived from traditional NLP models, not LLMs. All metrics are measured on a test case. Visit the [test cases section](evaluation-test-cases) to learn how to apply any metric on test cases for evaluation.
## Types of Metrics
@@ -42,7 +38,7 @@ A custom LLM evalated metric, is a custom metric whose evaluation is powered by
```python
from deepeval.metrics.llm_eval_metric import LLMEvalMetric
-from deepeval.test_case import LLMTestCaseParams
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
summarization_metric = LLMEvalMetric(
name="Summarization",
@@ -58,8 +54,8 @@ There are three mandatory and two optional parameters required when instantiatin
- `name`: name of metric
- `criteria`: a description outlining the specific evaluation aspects for each test case.
- `evaluation_params`: a list of type `LLMTestCaseParams`. Include only the parameters that are relevant for evaluation.
-- [Optional] `minimum_score`: the passing threshold
-- [Optional] `model`: the model name. This is defaulted to 'gpt-4' and we currently only support models from OpenAI.
+- [Optional] `minimum_score`: the passing threshold, defaulted to 0.5.
+- [Optional] `model`: the model name. This is defaulted to 'gpt-4-1106-preview' and we currently only support models from OpenAI.
All instances of `LLMEvalMetric` returns a score ranging from 0 - 1. A metric is only successful if the evaluation score is equal to or greater than `minimum_score`.
@@ -67,27 +63,82 @@ All instances of `LLMEvalMetric` returns a score ranging from 0 - 1. A metric is
For accurate and valid results, only the parameters that are mentioned in `criteria` should be included as a member of `evaluation_params`.
:::
-By defauly, `LLMEvalMetric` is evaluated using `GPT-4` from OpenAI.
+By default, `LLMEvalMetric` is evaluated using `GPT-4` from OpenAI. Azure OpenAI endpoints are also supported via `LLMEvalMetric`: pass the `deployment_id` parameter
+when instantiating the `LLMEvalMetric` class.
+
+```python
+summarization_metric = LLMEvalMetric(
+ name="Summarization",
+ criteria="Summarization - determine if the actual output is an accurate and concise summarization of the input.",
+ evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
+ minimum_score=0.5,
+ deployment_id="your-deployment-id"
+)
+```
+
+:::note
+Don't forget to set your Azure OpenAI key
+```python
+openai.api_key = "your-Azure-OpenAI-API-key"
+```
+:::
+
+## JudgementalGPT
+
+`JudgementalGPT` is an LLM agent developed in-house by [Confident AI](https://confident-ai.com) that's dedicated to evaluation and is superior to `LLMEvalMetric`. While it operates similarly to `LLMEvalMetric` by utilizing LLMs for scoring, it:
+
+- offers enhanced accuracy and reliability
+- is capable of generating justifications for its scores
+- has the ability to conditionally execute code that helps detect logical fallacies during evaluations
+
+To use `JudgementalGPT`, start by logging into Confident AI:
+
+```console
+deepeval login
+```
+
+Then paste in the following code to define a metric powered by `JudgementalGPT`:
+
+```python
+from deepeval.metrics.judgemental_gpt import JudgementalGPT
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams
+
+code_correctness_metric = JudgementalGPT(
+ name="Code Correctness",
+ criteria="Code Correctness - determine whether the python code in the 'actual output' produces a valid JSON.",
+ evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
+ minimum_score=0.5,
+)
+```
+
+Under the hood, `JudgementalGPT(...)` sends a request to Confident AI's servers, which host `JudgementalGPT`. `JudgementalGPT` accepts four arguments:
+
+- `name`: name of metric
+- `criteria`: a description outlining the specific evaluation aspects for each test case.
+- `evaluation_params`: a list of type `LLMTestCaseParams`. Include only the parameters that are relevant for evaluation.
+- [Optional] `minimum_score`: the passing threshold, defaulted to 0.5.
-## Custom Classic Metrics
+## Custom Metrics
-A custom classic metric, is a metric not provided by `deepeval` and whose criteria isn't evaluated using an LLM.
+You can implement your own evaluator (for example, your own GPT evaluator) by creating a custom metric. All custom metrics are automatically integrated with Confident AI.
```python
-from deepeval.metrics.metric import Metric
+from deepeval.metrics.base_metric import BaseMetric
-class LengthMetric(Metric):
+# Inherit BaseMetric
+class LengthMetric(BaseMetric):
# This metric checks if the output length is greater than 10 characters
def __init__(self, max_length: int=10):
- self.max_length = max_length
+ self.minimum_score = max_length
def measure(self, test_case: LLMTestCase):
- self.success = len(test_case.actual_output) > self.max_length
+ # Set self.success and self.score in the "measure" method
+ self.success = len(test_case.actual_output) > self.minimum_score
if self.success:
- score = 1
+ self.score = 1
else:
- score = 0
- return score
+ self.score = 0
+ return self.score
def is_successful(self):
return self.success
@@ -206,7 +257,7 @@ from deepeval.test_case import LLMTestCase
input = "What if these shoes don't fit?"
context = ["All customers are eligible for a 30 day full refund at no extra cost."]
-expected_output = "You're eligible for a 30 day refund at no extra cost.",
+expected_output = "You're eligible for a 30 day refund at no extra cost."
# Replace this with the actual output from your LLM application
actual_output = "We offer a 30-day full refund at no extra cost."
diff --git a/docs/docs/evaluation-test-cases.mdx b/docs/docs/evaluation-test-cases.mdx
index d6cc92a74..f98eb6425 100644
--- a/docs/docs/evaluation-test-cases.mdx
+++ b/docs/docs/evaluation-test-cases.mdx
@@ -174,17 +174,59 @@ test_case = LLMTestCase(
Remember, `context` is the ideal retrieval results for a given input and typically come from your evaluation dataset, whereas `retrieval_context` is your LLM application's actual retrieval results.
:::
-## Assert A Test Case
+## Run A Test Case
+
+`deepeval` offers an option to quickly run a test case without going through the CLI.
+
+```python
+# A hypothetical LLM application example
+import chatbot
+from deepeval.metrics.factual_consistency import FactualConsistencyMetric
+from deepeval.evaluator import run_test
+from deepeval.test_case import LLMTestCase
+
+prompt_template = """
+ Impersonate a dog named Rocky when replying to the text below.
-Similar to Pytest, `deepeval` allows you to assert any test case you create by calling the `assert_test` function by running `deepeval test run` via the CLI.
+ {text}
+"""
-`assert_test` takes two mandatory arguments: `test_case` and a list of `metrics`. A test case passes only if all metrics meet their respective evaluation criterion. Depending on the metric, a combination of `input`, `actual_output`, `expected_output`, and `context` is used to ascertain whether their criterion have been met.
+prompt = prompt_template.format(text="Who's a good boy?")
+context = ["Rocky is a good boy."]
+
+test_case = LLMTestCase(
+ input=prompt,
+ # Replace this with your actual LLM application
+ actual_output=chatbot.run(prompt),
+ expected_output="Me, ruff!",
+ context=context
+)
+
+metric = FactualConsistencyMetric(minimum_score=0.7)
+run_test(test_case, [metric])
+```
+
+## Assert Test Cases
+
+Before we begin going through the final sections, we highly recommend logging in to [Confident AI](https://confident-ai.com) (the platform powering deepeval) via the CLI. This way, you can keep track of all evaluation results generated each time you execute `deepeval test run`.
+
+```
+deepeval login
+```
+
+Similar to Pytest, `deepeval` allows you to assert any test case you create by calling the `assert_test` function and running `deepeval test run` via the CLI. `assert_test` takes two mandatory arguments:
+
+- `test_case`: an `LLMTestCase`
+- `metrics`: a list of metrics
+
+A test case passes only if all metrics meet their respective evaluation criterion. Depending on the metric, a combination of `input`, `actual_output`, `expected_output`, and `context` is used to ascertain whether their criterion have been met.
```python title="test_assert_example.py"
# A hypothetical LLM application example
import chatbot
from deepeval.metrics.factual_consistency import FactualConsistencyMetric
from deepeval.evaluator import assert_test
+from deepeval.test_case import LLMTestCase
prompt_template = """
Impersonate a dog named Rocky when replying to the text below.
@@ -192,7 +234,7 @@ prompt_template = """
{text}
"""
-def test_case():
+def test_case_1():
prompt = prompt_template.format(text="Who's a good boy?")
context = ["Rocky is a good boy."]
@@ -205,37 +247,49 @@ def test_case():
)
metric = FactualConsistencyMetric(minimum_score=0.7)
assert_test(test_case, metrics=[metric])
-```
-:::warning
-Typically, the `prompt_template` is implemented within your LLM application (ie. somewhere in our hypothetical `chatbot.run()` method), but from a visibility perspective we've made the `prompt_template` explicit.
-:::
+def test_case_2():
+ prompt = prompt_template.format(text="Who's a good boy?")
+ context = ["Rocky is a good boy."]
-In the CLI, run `deepeval test run`. You can also include an optional `-n` flag follow by a number (that determines the number of processes that will be used) to run tests in parallel.
+ test_case = LLMTestCase(
+ input=prompt,
+ # Replace this with your actual LLM application
+ actual_output=chatbot.run(prompt),
+ expected_output="Me, ruff!",
+ context=context
+ )
+ metric = FactualConsistencyMetric(minimum_score=0.7)
+ assert_test(test_case, metrics=[metric])
+```
+
+In the CLI, run the command `deepeval test run`, which uses Pytest under the hood. You can also include an optional `-n` flag followed by a number (that determines the number of processes that will be used) to run tests in parallel.
```console
deepeval test run test_assert_example.py -n 4
```
-We also highly recommend you to login to **[Confident AI](https://confident-ai.com)** (the platform powering deepeval) via the CLI. This way, you can keep track of all evaluation results generated each time you execute `deepeval test run`. You can also export completed evaluation datasets generated from your test file, view average metric scores for each test run, and much more.
+## Evaluate Test Cases in Bulk
-```
-deepeval login
-```
+Lastly, `deepeval` offers an `evaluate` function to evaluate multiple test cases at once, which is similar to `assert_test` but without needing Pytest or the CLI.
-Run `deepeval test run test_assert_example.py` in the CLI again to start evaluating results on the web.
-
-## Run A Test Case
+```python
+# A hypothetical LLM application example
+import chatbot
+from deepeval.metrics.factual_consistency import FactualConsistencyMetric
+from deepeval.evaluator import evaluate
+from deepeval.test_case import LLMTestCase
-`deepeval` also offers an option to quickly run test cases without going through the CLI or creating a test file.
+prompt_template = """
+ Impersonate a dog named Rocky when replying to the text below.
-```python
-from deepeval.evaluator import run_test
+ {text}
+"""
prompt = prompt_template.format(text="Who's a good boy?")
context = ["Rocky is a good boy."]
-test_case = LLMTestCase(
+first_test_case = LLMTestCase(
input=prompt,
# Replace this with your actual LLM application
actual_output=chatbot.run(prompt),
@@ -243,6 +297,18 @@ test_case = LLMTestCase(
context=context
)
+second_test_case = LLMTestCase(
+ input=prompt,
+ # Replace this with your actual LLM application
+ actual_output=chatbot.run(prompt),
+ expected_output="Me, ruff!",
+ context=context
+)
+
+dataset = [first_test_case, second_test_case]
+
metric = FactualConsistencyMetric(minimum_score=0.7)
-run_test(test_case, [metric])
+evaluate(dataset, [metric])
```
+
+Similar to `assert_test`, `evaluate` allows you to log and view test results on Confident AI. For more examples of `evaluate`, visit the [datasets section](evaluation-datasets).
diff --git a/docs/docs/getting-started.mdx b/docs/docs/getting-started.mdx
index 5c1116020..018056de3 100644
--- a/docs/docs/getting-started.mdx
+++ b/docs/docs/getting-started.mdx
@@ -10,9 +10,8 @@ import Envelope from "@site/src/components/envelope";
and iterate on LLM applications with the following principles in mind:
- Easily "unit test" LLM outputs in a similar way to Pytest.
+- Leverage various out-of-the-box LLM-evaluated and classic evaluation metrics.
- Define evaluation datasets in Python code.
-- Switch between LLM-evaluated and classic evaluation metrics.
-- Provide optimal models for all evaluation metrics (with a continuous open-science initiative to advance evaluation techniques)
- Metrics are simple to customize.
- [alpha] Bring evaluation into production using Python decorators.
@@ -35,14 +34,12 @@ In your newly created virtual environement, run:
pip install -U deepeval
```
-The command above will install the latest version of `deepeval` into your project's virtual environement.
+You can also keep track of all evaluation results by logging into our [all-in-one evaluation platform](https://confident-ai.com), and use Confident AI's [proprietary LLM evaluation agent](evaluation-metrics#judgementalgpt) for evaluation:
```console
deepeval login
```
-Although highly recommended, you can also optionally keep track of all evaluation results by logging into our [hosted platform](https://confident-ai.com).
-
:::note
**[Contact us](https://calendly.com/jeffreyip-cno/sales-call)** if you're dealing with sensitive data that has to reside in your private VPCs.
:::
@@ -301,13 +298,13 @@ deepeval test run test_example.py
You should now see a link being returned upon test completion. Paste it in your browser to view results.
-![ok](https://d2lsxfc3p6r9rv.cloudfront.net/dashboard.png)
+![ok](https://d2lsxfc3p6r9rv.cloudfront.net/test-summary.png)
### View Individual Test Cases
You can also view individual test cases for enhanced debugging:
-![ok](https://d2lsxfc3p6r9rv.cloudfront.net/dashboard2.png)
+![ok](https://d2lsxfc3p6r9rv.cloudfront.net/test-cases.png)
### Compare Hyperparameters
diff --git a/docs/docusaurus.config.js b/docs/docusaurus.config.js
index 038ae7d16..fb2ab075c 100644
--- a/docs/docusaurus.config.js
+++ b/docs/docusaurus.config.js
@@ -2,7 +2,7 @@
const config = {
title: 'DeepEval',
tagline: 'Evaluation Framework for LLMs',
- favicon: 'static/img/favicon.ico',
+ favicon: 'img/fav.ico',
// Set the production url of your site here
url: 'https://docs.confident-ai.com',
diff --git a/docs/static/img/fav.ico b/docs/static/img/fav.ico
new file mode 100644
index 000000000..8ea4129d9
Binary files /dev/null and b/docs/static/img/fav.ico differ
diff --git a/docs/static/img/favicon.ico b/docs/static/img/favicon.ico
deleted file mode 100644
index 7a7ed8420..000000000
Binary files a/docs/static/img/favicon.ico and /dev/null differ
diff --git a/examples/getting_started/test_example.py b/examples/getting_started/test_example.py
index 429088356..642520829 100644
--- a/examples/getting_started/test_example.py
+++ b/examples/getting_started/test_example.py
@@ -6,12 +6,14 @@
from deepeval.metrics.base_metric import BaseMetric
import deepeval
+# To run this file: deepeval test run test_example.py
+
def test_factual_consistency():
input = "What if these shoes don't fit?"
- context = (
+ context = [
"All customers are eligible for a 30 day full refund at no extra cost."
- )
+ ]
# Replace this with the actual output from your LLM application
actual_output = "We offer a 30-day full refund at no extra cost."
@@ -49,10 +51,10 @@ def __init__(self, max_length: int = 10):
def measure(self, test_case: LLMTestCase):
self.success = len(test_case.actual_output) > self.minimum_score
if self.success:
- score = 1
+ self.score = 1
else:
- score = 0
- return score
+ self.score = 0
+ return self.score
def is_successful(self):
return self.success
@@ -74,9 +76,9 @@ def test_length():
def test_everything():
input = "What if these shoes don't fit?"
- context = (
+ context = [
"All customers are eligible for a 30 day full refund at no extra cost."
- )
+ ]
# Replace this with the actual output from your LLM application
actual_output = "We offer a 30-day full refund at no extra cost."
diff --git a/pyproject.toml b/pyproject.toml
index ecd09309f..5bcbcc436 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "deepeval"
-version = "0.20.17"
+version = "0.20.19"
description = "The Evaluation Framework for LLMs"
authors = ["Jeffrey Ip "]
license = "Apache-2.0"
diff --git a/setup.py b/setup.py
index 9737eefea..df15c5459 100644
--- a/setup.py
+++ b/setup.py
@@ -29,6 +29,8 @@
"pandas",
"pydantic", # loosen pydantic requirements as we support multiple
"sentry-sdk",
+ "pytest-xdist",
+ "portalocker",
],
extras_require={
"bias": [
diff --git a/tests/test_answer_relevancy.py b/tests/test_answer_relevancy.py
index 8e5ea8493..9ff598fca 100644
--- a/tests/test_answer_relevancy.py
+++ b/tests/test_answer_relevancy.py
@@ -2,31 +2,14 @@
"""
import pytest
from deepeval.test_case import LLMTestCase
-from deepeval.metrics.answer_relevancy import (
- AnswerRelevancyMetric,
- assert_answer_relevancy,
-)
-from deepeval.metrics.answer_relevancy import is_answer_relevant
-from deepeval.evaluator import run_test, assert_test
-from .utils import assert_viable_score
+from deepeval.metrics import AnswerRelevancyMetric
+
+from deepeval.evaluator import assert_test, run_test
query = "What is Python?"
answer = "Python is a programming language?"
-def test_answer_relevancy():
- assert_answer_relevancy(query, answer, minimum_score=0.5)
-
-
-def test_answer_relevancy():
- assert is_answer_relevant(query, answer, minimum_score=0.5)
-
-
-def test_answer_not_relevant():
- with pytest.raises(AssertionError):
- assert_answer_relevancy(query, "He is not your friend")
-
-
def test_query_answer_relevancy():
scorer = AnswerRelevancyMetric(minimum_score=0.5)
test_case = LLMTestCase(input=query, actual_output=answer)
@@ -36,28 +19,21 @@ def test_query_answer_relevancy():
def test_compare_answer_relevancy_2():
scorer = AnswerRelevancyMetric(minimum_score=0.5)
test_case = LLMTestCase(input=query, actual_output="Programming lang")
- test_case_2 = LLMTestCase(
- input=query, actual_output="Python is a programming lang"
- )
- results = run_test([test_case, test_case_2], metrics=[scorer])
- assert results[1].score > results[0].score
+ run_test(test_case, metrics=[scorer])
def test_compare_answer_relevancy():
metric = AnswerRelevancyMetric(minimum_score=0.5)
query = "what is python"
- test_case = LLMTestCase(input=query, actual_output="Programming lang")
test_case_2 = LLMTestCase(
input=query, actual_output="Python is a programming lang"
)
- result = run_test([test_case, test_case_2], metrics=[metric])
- assert result[1].score > result[0].score
+ run_test(test_case_2, metrics=[metric])
def test_cross_encoder_answer_relevancy():
- scorer = AnswerRelevancyMetric(
+ metric = AnswerRelevancyMetric(
minimum_score=0.5, model_type="cross_encoder"
)
test_case = LLMTestCase(input=query, actual_output=answer)
- score = assert_test(test_case, [scorer])
- assert_viable_score(score[0].score)
+ assert_test(test_case, [metric])
diff --git a/tests/test_bias.py b/tests/test_bias.py
index 67a7a0162..861ce4ebf 100644
--- a/tests/test_bias.py
+++ b/tests/test_bias.py
@@ -4,7 +4,7 @@
import pytest
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
-from deepeval.metrics.bias_classifier import UnBiasedMetric, assert_unbiased
+from deepeval.metrics.bias_classifier import UnBiasedMetric
from deepeval.evaluator import assert_test
output = "Winners of the FIFA world cup were the French national football team"
@@ -14,7 +14,7 @@
def test_non_toxic():
assert_test(
- test_cases=LLMTestCase(
+ test_case=LLMTestCase(
input="placeholder",
actual_output=output,
),
@@ -22,15 +22,6 @@ def test_non_toxic():
)
-def test_non_toxic_2():
- with pytest.raises(AssertionError):
- assert_unbiased(
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
- input="placeholder",
- actual_output="Left wing is evil.",
- )
-
-
def test_non_toxic_metric():
metric = UnBiasedMetric(evaluation_params=[LLMTestCaseParams.INPUT])
test_case = LLMTestCase(input="placeholder", actual_output=output)
diff --git a/tests/test_chatbot_example.py b/tests/test_chatbot_example.py
index cc08217ad..ee9d50d0c 100644
--- a/tests/test_chatbot_example.py
+++ b/tests/test_chatbot_example.py
@@ -1,5 +1,5 @@
import pytest
-from deepeval.metrics.factual_consistency import FactualConsistencyMetric
+from deepeval.metrics import FactualConsistencyMetric
from deepeval.test_case import LLMTestCase
from deepeval.evaluator import assert_test
diff --git a/tests/test_custom_metric.py b/tests/test_custom_metric.py
index 46a086610..7148bb36f 100644
--- a/tests/test_custom_metric.py
+++ b/tests/test_custom_metric.py
@@ -2,7 +2,7 @@
"""
from deepeval.test_case import LLMTestCase
-from deepeval.metrics.base_metric import BaseMetric
+from deepeval.metrics import BaseMetric
from deepeval.evaluator import assert_test
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
index 904bafef6..8c300664d 100644
--- a/tests/test_dataset.py
+++ b/tests/test_dataset.py
@@ -1,6 +1,6 @@
-import os
+# import os
-import pytest
+# import pytest
# def test_evaluation_dataset():
@@ -26,69 +26,3 @@
# id_column="id",
# )
# assert len(dataset) == 5
-
-
-# @pytest.mark.skip(reason="OpenAI costs")
-# def test_create_synthetic_dataset():
-# """
-# test for creating a synthetic dataset
-# """
-# from deepeval.dataset import create_evaluation_query_answer_pairs
-
-# dataset = create_evaluation_query_answer_pairs(
-# openai_api_key=os.environ["OPENAI_API_KEY"],
-# context="FastAPI is a modern, fast (high-performance), web framework for building APIs with Python 3.7+ based on standard Python type hints.",
-# n=1,
-# )
-# assert len(dataset) == 1
-
-
-# def test_dataset_evaluation():
-# """
-# Test dataset evaluation
-# """
-# from deepeval.dataset import EvaluationDataset
-# from deepeval.metrics.conceptual_similarity import (
-# ConceptualSimilarityMetric,
-# )
-
-# csv_filename = "sample.csv"
-
-# csv_file = """id,query,expected_output,output
-# 1,"Hello, world!","This is a greeting.","This is a friendly greeting."
-# 2,"OpenAI GPT-3","A powerful language model.","A powerful AI language model."
-# 3,"CSV Example","Working with CSV data.","Working with data in CSV format."
-# 4,"Python Programming","Coding in Python.","Writing code in Python."
-# 5,"Data Science","Analyzing data.","Exploring and analyzing data."
-# """
-
-# with open(csv_filename, "w") as file:
-# file.write(csv_file)
-
-# dataset: EvaluationDataset = EvaluationDataset.from_csv(
-# csv_filename,
-# query_column="query",
-# expected_output_column="expected_output",
-# id_column="id",
-# output_column="output",
-# )
-# metric = ConceptualSimilarityMetric()
-# result = dataset.run_evaluation(
-# outputs="output",
-# metrics=[metric],
-# )
-
-
-# @pytest.mark.skip(reason="OpenAI costs")
-# def test_create_evaluation_query_answer_pairs():
-# """
-# test for creating a synthetic dataset
-# """
-# from deepeval.dataset import create_evaluation_query_answer_pairs
-
-# dataset = create_evaluation_query_answer_pairs(
-# openai_api_key=os.environ["OPENAI_API_KEY"],
-# context="FastAPI is a modern, fast (high-performance), web framework for building APIs with Python 3.7+ based on standard Python type hints.",
-# n=10,
-# )
-# assert len(dataset) == 10
diff --git a/tests/test_factual_consistency.py b/tests/test_factual_consistency.py
index 313796f2a..6654ef6ed 100644
--- a/tests/test_factual_consistency.py
+++ b/tests/test_factual_consistency.py
@@ -1,31 +1,8 @@
import pytest
from deepeval.test_case import LLMTestCase
-from deepeval.metrics.factual_consistency import (
- FactualConsistencyMetric,
- assert_factual_consistency,
-)
-from deepeval.evaluator import assert_test
-
-
-def test_factual_consistency():
- # legacy functions - consider removing
- with pytest.raises(AssertionError):
- assert_factual_consistency(
- context=[
- "After a long day at work, Sarah decided to go for a walk in the park to unwind. She put on her sneakers and grabbed her headphones before heading out. As she strolled along the path, she noticed families having picnics, children playing on the playground, and ducks swimming in the pond."
- ],
- output="Sarah spent the evening at the library, engrossed in a book.",
- )
-
+from deepeval.metrics import FactualConsistencyMetric
-def test_factual_consistency_2():
- # legacy functions - consider removing
- assert_factual_consistency(
- context=[
- "After a long day at work, Sarah decided to go for a walk in the park to unwind. She put on her sneakers and grabbed her headphones before heading out. As she strolled along the path, she noticed families having picnics, children playing on the playground, and ducks swimming in the pond."
- ],
- output="Sarah went out for a walk in the park.",
- )
+from deepeval.evaluator import assert_test
def test_factual_consistency_metric():
diff --git a/tests/test_llm_metric.py b/tests/test_llm_metric.py
index 7a2714899..12973d488 100644
--- a/tests/test_llm_metric.py
+++ b/tests/test_llm_metric.py
@@ -1,8 +1,19 @@
import pytest
+import openai
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
-from deepeval.metrics.llm_eval_metric import LLMEvalMetric
+from deepeval.metrics import LLMEvalMetric
from deepeval.evaluator import assert_test
+# Configure the OpenAI client to talk to Azure
+openai.api_type = "azure"
+
+# The Azure OpenAI API version to use
+openai.api_version = "2023-03-15"
+
+# The base URL for your Azure OpenAI resource, as shown in the Azure portal
+openai.api_base = "https://your-resource-name.openai.azure.com/"
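+# Left blank as a placeholder; supply your Azure OpenAI key (for example
+# from an environment variable) before running this test.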
+openai.api_key = ""
+
def test_chat_completion():
"""Test Chat Completion"""
@@ -23,3 +34,25 @@ def test_chat_completion():
)
assert_test(test_case, [metric])
+
+
+def test_azure_openai_chat_completion():
+ """Test Chat Completion"""
+ metric = LLMEvalMetric(
+ name="Validity",
+ criteria="The response is a valid response to the prompt.",
+ minimum_score=0.5,
+ evaluation_params=[
+ LLMTestCaseParams.INPUT,
+ LLMTestCaseParams.ACTUAL_OUTPUT,
+ ],
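+ # Assumed to name the Azure OpenAI deployment that serves the request (placeholder value below).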
+ deployment_id="your-deployment-id",
+ )
+ test_case = LLMTestCase(
+ input="What is the capital of France?",
+ actual_output="Paris",
+ expected_output="Paris",
+ context=["Geography"],
+ )
+
+ assert_test(test_case, [metric])
diff --git a/tests/test_overall_score.py b/tests/test_overall_score.py
deleted file mode 100644
index 5a95e488e..000000000
--- a/tests/test_overall_score.py
+++ /dev/null
@@ -1,127 +0,0 @@
-"""Test alert score
-"""
-import os
-
-import pytest
-from deepeval.test_case import LLMTestCase
-from deepeval.api import Api
-from deepeval.metrics.overall_score import (
- OverallScoreMetric,
- assert_overall_score,
-)
-from deepeval.evaluator import assert_test, run_test
-
-from .utils import assert_viable_score
-
-TEST_API_KEY = "u1s5aFlB6kRyVz/16CZuc7JOQ7e7sCw00N7nfeMZOrk="
-os.environ["CONFIDENT_AI_API_KEY"] = TEST_API_KEY
-
-query = "Who won the FIFA World Cup in 2018?"
-output = "Winners of the FIFA world cup were the French national football team"
-expected_output = "French national football team"
-context = "The FIFA World Cup in 2018 was won by the French national football team. They defeated Croatia 4-2 in the final match to claim the championship."
-
-client = Api(api_key=TEST_API_KEY)
-
-metric = OverallScoreMetric()
-
-
-class TestOverallScore:
- metric = OverallScoreMetric()
-
- def test_overall_score(self):
- os.environ["CONFIDENT_AI_API_KEY"] = TEST_API_KEY
- assert_overall_score(
- query=query,
- output=output,
- expected_output=expected_output,
- context=context,
- )
-
- def test_overall_score_worst_context(self):
- test_case = LLMTestCase(
- input=query,
- actual_output=output,
- expected_output=expected_output,
- context=["He doesn't know how to code"],
- )
- test_case_2 = LLMTestCase(
- input=query,
- actual_output=output,
- expected_output=expected_output,
- context=context,
- )
- scores = run_test([test_case, test_case_2], metrics=[self.metric])
- assert scores[1].score > scores[0].score, "Failed the test"
-
- def test_overall_score_worst_output(self):
- test_case = LLMTestCase(
- input=query,
- actual_output="Not relevant",
- expected_output=expected_output,
- context=["He doesn't know how to code"],
- )
- test_case_2 = LLMTestCase(
- input=query,
- actual_output=output,
- expected_output=expected_output,
- context=["He doesn't know how to code"],
- )
- scores = run_test([test_case, test_case_2], metrics=[self.metric])
- assert scores[0] > scores[1]
-
- def test_worst_expected_output(self):
- test_case = LLMTestCase(
- input=query,
- actual_output="Not relevant",
- expected_output="STranger things",
- context=["He doesn't know how to code"],
- )
- score_4 = self.metric.measure(test_case)
- test_case_2 = LLMTestCase(
- input=query,
- actual_output="Not relevant",
- expected_output=expected_output,
- context=["He doesn't know how to code"],
- )
- scores = run_test([test_case, test_case_2], metrics=[self.metric])
- assert scores[0] > scores[1]
-
- def test_overall_score_metric(self):
- test_case = LLMTestCase(
- input=query,
- actual_output=output,
- expected_output=expected_output,
- context=context,
- )
- result = run_test([test_case], metrics=[self.metric])
- assert_viable_score(result[0].score)
-
- def test_overall_score_metric_no_query(self):
- test_case = LLMTestCase(
- input="placeholder",
- actual_output=output,
- expected_output=expected_output,
- context=context,
- )
- assert_test([test_case], metrics=[self.metric])
-
- def test_overall_score_metric_no_query_no_context(self):
- test_case = LLMTestCase(
- input="placeholder",
- actual_output=output,
- expected_output=expected_output,
- )
- result = run_test([test_case], metrics=[self.metric])
- assert result[0].success, "Overall score metric not working"
- assert_viable_score(result[0].score)
-
- def test_overall_score_metric_no_context_no_expected_output(self):
- test_case = LLMTestCase(
- input=query,
- actual_output=output,
- )
- score = self.metric.measure(test_case)
- result = run_test([test_case], metrics=[self.metric])
- assert result[0].success, "Overall score metric not working"
- assert_viable_score(result[0].score)
diff --git a/tests/test_quickstart.py b/tests/test_quickstart.py
index 0ff311ab8..91ed763f8 100644
--- a/tests/test_quickstart.py
+++ b/tests/test_quickstart.py
@@ -2,8 +2,8 @@
"""
import pytest
-from deepeval.metrics.factual_consistency import assert_factual_consistency
-from deepeval.metrics.overall_score import OverallScoreMetric
+from deepeval.metrics import FactualConsistencyMetric
+
from deepeval.test_case import LLMTestCase
from deepeval.evaluator import assert_test
@@ -16,15 +16,20 @@ def generate_llm_output(query: str):
def test_llm_output():
input = "What is the customer success phone line?"
context = ["Our customer success phone line is 1200-231-231."]
- output = generate_llm_output(input)
- assert_factual_consistency(output, context)
+ test_case = LLMTestCase(
+ input=input, actual_output=generate_llm_output(input), context=context
+ )
+ assert_test(test_case, [FactualConsistencyMetric(minimum_score=0.5)])
def test_llm_output_custom():
actual_output = "Dogs and cats hate to walk around the beach."
context = ["Dogs and cats love to walk around the beach."]
+ test_case = LLMTestCase(
+ input="Placerholder", actual_output=actual_output, context=context
+ )
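+ # FactualConsistencyMetric scores actual_output against context, so input is only a placeholder here.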
with pytest.raises(AssertionError):
- assert_factual_consistency(actual_output, context)
+ assert_test(test_case, [FactualConsistencyMetric(minimum_score=0.5)])
def test_0():
@@ -39,5 +44,6 @@ def test_0():
expected_output=expected_output,
context=context,
)
- metric = OverallScoreMetric()
- assert_test(test_case, metrics=[metric])
+ metric = FactualConsistencyMetric()
+ with pytest.raises(AssertionError):
+ assert_test(test_case, metrics=[metric])
diff --git a/tests/test_ragas.py b/tests/test_ragas.py
index c4fbfb972..1260af34e 100644
--- a/tests/test_ragas.py
+++ b/tests/test_ragas.py
@@ -1,6 +1,6 @@
import pytest
from deepeval.test_case import LLMTestCase
-from deepeval.metrics.ragas_metric import RagasMetric
+from deepeval.metrics import RagasMetric
from deepeval.evaluator import assert_test
query = "Who won the FIFA World Cup in 2018?"
diff --git a/tests/test_scoring.py b/tests/test_scoring.py
index 052eea9b9..a381eb90b 100644
--- a/tests/test_scoring.py
+++ b/tests/test_scoring.py
@@ -181,3 +181,37 @@ def test_faithfulness_score_different_strings(self):
prediction = "A fast brown fox leaped over the sleeping dog."
# When the prediction is different from the target, the faithfulness score should be less than 1.0.
self.assertLess(Scorer.faithfulness_score(target, prediction), 1.0)
+
+ # Tests for toxicity scores
+ # TODO: add more tests covering the returned score dict and other model types.
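+ # Each call below unpacks a two-element result; only the mean toxicity score is range-checked and the second element is ignored.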
+
+ def test_neural_toxic_score_original_model(self):
+ prediction = "This is a non-toxic text."
+ mean_toxicity_score, _ = Scorer.neural_toxic_score(
+ prediction, model="original"
+ )
+ self.assertTrue(0 <= mean_toxicity_score <= 1)
+
+ def test_neural_toxic_score_unbiased_model(self):
+ prediction = "This is a non-toxic text."
+ mean_toxicity_score, _ = Scorer.neural_toxic_score(
+ prediction, model="unbiased"
+ )
+ self.assertTrue(0 <= mean_toxicity_score <= 1)
+
+ def test_neural_toxic_score_multilingual_model(self):
+ prediction = "This is a non-toxic text."
+ mean_toxicity_score, _ = Scorer.neural_toxic_score(
+ prediction, model="multilingual"
+ )
+ self.assertTrue(0 <= mean_toxicity_score <= 1)
+
+ def test_neural_toxic_score_default_model(self):
+ prediction = "This is a non-toxic text."
+ mean_toxicity_score, _ = Scorer.neural_toxic_score(prediction)
+ self.assertTrue(0 <= mean_toxicity_score <= 1)
+
+ def test_neural_toxic_score_invalid_model(self):
+ prediction = "This is a non-toxic text."
+ with self.assertRaises(AssertionError):
+ Scorer.neural_toxic_score(prediction, model="invalid_model")
diff --git a/tests/test_similar_ranking.py b/tests/test_similar_ranking.py
deleted file mode 100644
index 123b740ee..000000000
--- a/tests/test_similar_ranking.py
+++ /dev/null
@@ -1,55 +0,0 @@
-"""Tests for answer relevancy
-"""
-from deepeval.test_case import LLMTestCase
-from deepeval.evaluator import run_test, assert_test
-from deepeval.metrics.ranking_similarity import (
- RankingSimilarity,
- assert_ranking_similarity,
-)
-
-list1 = ["a", "b"]
-list2 = ["b", "c"]
-list3 = ["b", "a"]
-input = "Placeholder Input"
-actual_output = "Placeholder Output"
-
-
-def test_assert_similar_ranking():
- assert_ranking_similarity(
- input=input,
- actual_output=actual_output,
- context=list1,
- retrieval_context=list2,
- minimum_score=0.4,
- )
-
-
-def test_similar_ranking_2():
- scorer = RankingSimilarity(minimum_score=0.5)
- test_case = LLMTestCase(
- input=input,
- actual_output=actual_output,
- context=list1,
- retrieval_context=list2,
- )
- test_case_2 = LLMTestCase(
- input=input,
- actual_output=actual_output,
- context=list1,
- retrieval_context=list3,
- )
- test_results = run_test([test_case, test_case_2], metrics=[scorer])
- assert (
- test_results[1].score > test_results[0].score
- ), "Similarity not working"
-
-
-def test_similar_ranking_3():
- metric = RankingSimilarity(minimum_score=0.3)
- test_case = LLMTestCase(
- input=input,
- actual_output=actual_output,
- context=list1,
- retrieval_context=list2,
- )
- assert_test(test_cases=test_case, metrics=[metric])
diff --git a/tests/test_toxic.py b/tests/test_toxic.py
index 4f856b297..5735f3c2b 100644
--- a/tests/test_toxic.py
+++ b/tests/test_toxic.py
@@ -4,27 +4,25 @@
import pytest
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
-from deepeval.metrics.toxic_classifier import NonToxicMetric, assert_non_toxic
+from deepeval.metrics.toxic_classifier import NonToxicMetric
from deepeval.evaluator import assert_test
output = "Winners of the FIFA world cup were the French national football team"
def test_non_toxic():
- assert_non_toxic(
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
- input="Placeholder",
- actual_output="Who won the FIFA World Cup in 2018?",
+ metric = NonToxicMetric(evaluation_params=[LLMTestCaseParams.INPUT])
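+ # With evaluation_params=[LLMTestCaseParams.INPUT], the metric scores the input text for toxicity.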
+ test_case = LLMTestCase(
+ input="Who won the FIFA World Cup in 2018?", actual_output="Don't know."
)
+ assert_test(test_case, [metric])
def test_non_toxic_2():
+ test_case = LLMTestCase(input="Placeholder", actual_output="You're stupid")
+ metric = NonToxicMetric(evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT])
with pytest.raises(AssertionError):
- assert_non_toxic(
- evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],
- input="Placeholder",
- actual_output="You are stupid!",
- )
+ assert_test(test_case, [metric])
def test_non_toxic_metric():
diff --git a/tests/test_without_pytest.py b/tests/test_without_pytest.py
deleted file mode 100644
index 9cb7ba068..000000000
--- a/tests/test_without_pytest.py
+++ /dev/null
@@ -1,9 +0,0 @@
-"""Test to make sure assert statements can work
-"""
-from deepeval.metrics.conceptual_similarity import assert_conceptual_similarity
-
-# assert_conceptual_similarity(
-# output="python is a programming language",
-# expected_output="Python is a snake.",
-# minimum_score=0.3,
-# )
diff --git a/tests/utils.py b/tests/utils.py
deleted file mode 100644
index ce0fd093c..000000000
--- a/tests/utils.py
+++ /dev/null
@@ -1,3 +0,0 @@
-def assert_viable_score(score: float):
- assert score >= 0, f"Score {score} is less than 0"
- assert score <= 1, f"Score {score} is greater than 1"