diff --git a/.github/workflows/deepeval-results.yml b/.github/workflows/deepeval-results.yml index 5f77643af..c05abf087 100644 --- a/.github/workflows/deepeval-results.yml +++ b/.github/workflows/deepeval-results.yml @@ -43,6 +43,11 @@ jobs: - name: Check if 'deepeval' script is available run: ls -l $(poetry env info --path)/bin/deepeval || echo "deepeval script not found" + - name: Run deepeval login + env: + CONFIDENT_API_KEY: ${{ secrets.CONFIDENT_API_KEY }} + run: poetry run deepeval login --confident-api-key "$CONFIDENT_API_KEY" + - name: Run deepeval tests and capture output run: poetry run deepeval test run tests/test_quickstart.py > output.txt 2>&1 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6de173374..2e4e03476 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -65,4 +65,4 @@ jobs: env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} run: | - poetry run pytest tests/ --ignore=tests/test_g_eval.py + poetry run pytest tests/ diff --git a/README.md b/README.md index 3f69a1d01..f65632593 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,6 @@ Whether your application is implemented via RAG or fine-tuning, LangChain or Lla - Contextual Recall - Contextual Precision - RAGAS - - Toxicity - Hallucination - Toxicity - Bias diff --git a/deepeval/_version.py b/deepeval/_version.py index 8e29af347..c8a7aba1a 100644 --- a/deepeval/_version.py +++ b/deepeval/_version.py @@ -1 +1 @@ -__version__: str = "0.20.49" +__version__: str = "0.20.56" diff --git a/deepeval/cli/test.py b/deepeval/cli/test.py index 7ea86e1bb..3e2e5edf1 100644 --- a/deepeval/cli/test.py +++ b/deepeval/cli/test.py @@ -1,10 +1,11 @@ import pytest import typer import os +import json from typing_extensions import Annotated from typing import Optional from deepeval.test_run import test_run_manager, TEMP_FILE_NAME -from deepeval.utils import delete_file_if_exists +from deepeval.utils import delete_file_if_exists, get_deployment_configs from deepeval.test_run import invoke_test_run_end_hook from deepeval.telemetry import capture_evaluation_count @@ -56,6 +57,11 @@ def run( if exit_on_first_failure: pytest_args.insert(0, "-x") + deployment_configs = get_deployment_configs() + if deployment_configs is not None: + deployment_configs_json = json.dumps(deployment_configs) + pytest_args.extend(["--deployment", deployment_configs_json]) + pytest_args.extend( [ "--verbose" if verbose else "--quiet", diff --git a/deepeval/dataset/api.py b/deepeval/dataset/api.py index a25cbcea6..641967fda 100644 --- a/deepeval/dataset/api.py +++ b/deepeval/dataset/api.py @@ -7,6 +7,7 @@ class Golden(BaseModel): actual_output: Optional[str] = Field(None, alias="actualOutput") expected_output: Optional[str] = Field(None, alias="expectedOutput") context: Optional[list] = Field(None) + retrieval_context: Optional[list] = Field(None, alias="retrievalContext") additional_metadata: Optional[Dict] = Field( None, alias="additionalMetadata" ) diff --git a/deepeval/dataset/dataset.py b/deepeval/dataset/dataset.py index 96da6e235..bc931ca8f 100644 --- a/deepeval/dataset/dataset.py +++ b/deepeval/dataset/dataset.py @@ -3,7 +3,6 @@ from rich.console import Console import json import webbrowser -import os from deepeval.metrics import BaseMetric from deepeval.test_case import LLMTestCase @@ -26,9 +25,20 @@ class EvaluationDataset: test_cases: List[LLMTestCase] goldens: List[Golden] - def __init__(self, test_cases: List[LLMTestCase] = []): - self.test_cases = test_cases - self.goldens = [] + def __init__( + self, + 
+        alias: Optional[str] = None,
+        goldens: Optional[List[Golden]] = None,
+        test_cases: Optional[List[LLMTestCase]] = None,
+    ):
+        if test_cases is not None:
+            for test_case in test_cases:
+                test_case.dataset_alias = alias
+            self.test_cases = test_cases
+        else:
+            self.test_cases = []
+        self.goldens = goldens or []
+        self.alias = alias
 
     def add_test_case(self, test_case: LLMTestCase):
         self.test_cases.append(test_case)
@@ -39,6 +49,11 @@ def __iter__(self):
     def evaluate(self, metrics: List[BaseMetric]):
         from deepeval import evaluate
 
+        if len(self.test_cases) == 0:
+            raise ValueError(
+                "No test cases found in evaluation dataset. Unable to evaluate empty dataset."
+            )
+
         return evaluate(self.test_cases, metrics)
 
     def add_test_cases_from_csv_file(
@@ -109,6 +124,7 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None):
                     actual_output=actual_output,
                     expected_output=expected_output,
                     context=context,
+                    dataset_alias=self.alias,
                 )
             )
 
@@ -171,6 +187,7 @@ def add_test_cases_from_json_file(
                     actual_output=actual_output,
                     expected_output=expected_output,
                     context=context,
+                    dataset_alias=self.alias,
                 )
             )
 
@@ -238,6 +255,7 @@ def add_test_cases_from_hf_dataset(
                     actual_output=actual_output,
                     expected_output=expected_output,
                     context=context,
+                    dataset_alias=self.alias,
                 )
             )
 
@@ -274,6 +292,7 @@ def push(self, alias: str):
 
     def pull(self, alias: str, auto_convert_goldens_to_test_cases: bool = True):
         if is_confident():
+            self.alias = alias
             api = Api()
             result = api.get_request(
                 endpoint=Endpoints.DATASET_ENDPOINT.value,
@@ -284,10 +303,10 @@ def pull(self, alias: str, auto_convert_goldens_to_test_cases: bool = True):
                 goldens=result["goldens"],
             )
 
-            self.goldens = response.goldens
-
             if auto_convert_goldens_to_test_cases:
-                self.test_cases = convert_goldens_to_test_cases(self.goldens)
+                self.test_cases = convert_goldens_to_test_cases(
+                    response.goldens, alias
+                )
         else:
             raise Exception(
                 "Run `deepeval login` to pull dataset from Confident AI"
diff --git a/deepeval/dataset/utils.py b/deepeval/dataset/utils.py
index ef6ec40e5..48f27ce8e 100644
--- a/deepeval/dataset/utils.py
+++ b/deepeval/dataset/utils.py
@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Optional
 
 from deepeval.dataset.api import Golden
 from deepeval.test_case import LLMTestCase
@@ -18,7 +18,9 @@ def convert_test_cases_to_goldens(
     return goldens
 
 
-def convert_goldens_to_test_cases(goldens: List[Golden]) -> List[LLMTestCase]:
+def convert_goldens_to_test_cases(
+    goldens: List[Golden], dataset_alias: Optional[str] = None
+) -> List[LLMTestCase]:
     test_cases = []
     for golden in goldens:
         test_case = LLMTestCase(
@@ -26,6 +28,8 @@ def convert_goldens_to_test_cases(goldens: List[Golden]) -> List[LLMTestCase]:
             actual_output=golden.actual_output,
             expected_output=golden.expected_output,
             context=golden.context,
+            retrieval_context=golden.retrieval_context,
+            dataset_alias=dataset_alias,
         )
         test_cases.append(test_case)
     return test_cases
diff --git a/deepeval/integrations/__init__.py b/deepeval/integrations/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/deepeval/integrations/harness/__init__.py b/deepeval/integrations/harness/__init__.py
new file mode 100644
index 000000000..f5d330218
--- /dev/null
+++ b/deepeval/integrations/harness/__init__.py
@@ -0,0 +1 @@
+from deepeval.integrations.harness.callback import DeepEvalHarnessCallback
diff --git a/deepeval/integrations/harness/callback.py b/deepeval/integrations/harness/callback.py
new file mode 100644
index 000000000..e8896a09f
--- /dev/null
+++ b/deepeval/integrations/harness/callback.py
@@ -0,0 +1,26 @@
+from typing import List, Union
+
+
+# from deepeval.experimental import BaseEvaluationExperiment
+
+try:
+    from transformers.trainer_callback import TrainerCallback
+
+    class DeepEvalHarnessCallback(TrainerCallback):
+        """
+        A [transformers.TrainerCallback] that logs various harness LLM evaluation metrics to DeepEval
+        """
+
+        def __init__(self, experiments):
+            super().__init__()
+            self.experiments = experiments
+
+            raise NotImplementedError("DeepEvalHarnessCallback is WIP")
+
+except ImportError:
+
+    class DeepEvalHarnessCallback:
+        def __init__(self, *args, **kwargs):
+            raise ImportError(
+                "The 'transformers' library is required to use the DeepEvalHarnessCallback."
+            )
diff --git a/deepeval/integrations/hugging_face/__init__.py b/deepeval/integrations/hugging_face/__init__.py
new file mode 100644
index 000000000..33e444e94
--- /dev/null
+++ b/deepeval/integrations/hugging_face/__init__.py
@@ -0,0 +1 @@
+from deepeval.integrations.hugging_face.callback import DeepEvalHuggingFaceCallback
diff --git a/deepeval/integrations/hugging_face/callback.py b/deepeval/integrations/hugging_face/callback.py
new file mode 100644
index 000000000..7c273d05a
--- /dev/null
+++ b/deepeval/integrations/hugging_face/callback.py
@@ -0,0 +1,236 @@
+from typing import Union, List, Dict
+from .utils import get_column_order, generate_test_cases
+from .rich_manager import RichManager
+
+from deepeval.metrics import BaseMetric
+from deepeval.evaluate import execute_test
+from deepeval.dataset import EvaluationDataset
+
+try:
+    from transformers import (
+        TrainerCallback,
+        ProgressCallback,
+        Trainer,
+        TrainingArguments,
+        TrainerState,
+        TrainerControl,
+    )
+
+    class DeepEvalHuggingFaceCallback(TrainerCallback):
+        """
+        Custom callback for deep evaluation during model training.
+
+        Args:
+            metrics (Union[BaseMetric, List[BaseMetric]]): Evaluation metrics.
+            evaluation_dataset (EvaluationDataset): Dataset for evaluation.
+            tokenizer_args (Dict): Arguments for the tokenizer.
+            aggregation_method (str): Method for aggregating metric scores.
+            trainer (Trainer): Model trainer.
+        """
+
+        def __init__(
+            self,
+            metrics: Union[BaseMetric, List[BaseMetric]] = None,
+            evaluation_dataset: EvaluationDataset = None,
+            tokenizer_args: Dict = None,
+            aggregation_method: str = "avg",
+            trainer: Trainer = None,
+            show_table: bool = False,
+            show_table_every: int = 1,
+        ) -> None:
+            super().__init__()
+
+            self.show_table = show_table
+            self.show_table_every = show_table_every
+            self.metrics = metrics
+            self.evaluation_dataset = evaluation_dataset
+            self.tokenizer_args = tokenizer_args
+            self.aggregation_method = aggregation_method
+            self.trainer = trainer
+
+            self.task_descriptions = {
+                "generating": "[blue][STATUS] [white]Generating output from model (might take up to a few minutes)",
+                "training": "[blue][STATUS] [white]Training in Progress",
+                "evaluate": "[blue][STATUS] [white]Evaluating test-cases (might take up to a few minutes)",
+                "training_end": "[blue][STATUS] [white]Training Ended",
+            }
+
+            self.train_bar_started = False
+            self.epoch_counter = 0
+            self.deepeval_metric_history = []
+
+            total_train_epochs = self.trainer.args.num_train_epochs
+            self.rich_manager = RichManager(show_table, total_train_epochs)
+            self.trainer.remove_callback(ProgressCallback)
+
+        def _calculate_metric_scores(self) -> Dict[str, List[float]]:
+            """
+            Calculate final evaluation scores based on metrics and test cases.
+
+            Returns:
+                Dict[str, List[float]]: Metric scores for each test case.
+            """
+            test_results = execute_test(
+                test_cases=self.evaluation_dataset.test_cases,
+                metrics=self.metrics,
+            )
+            scores = {}
+            for test in test_results:
+                for metric in test.metrics:
+                    metric_name = str(metric.__name__).lower().replace(" ", "_")
+                    metric_score = metric.score
+                    scores.setdefault(metric_name, []).append(metric_score)
+
+            scores = self._aggregate_scores(scores)
+            return scores
+
+        def _aggregate_scores(
+            self, scores: Dict[str, List[float]]
+        ) -> Dict[str, float]:
+            """
+            Aggregate metric scores using the specified method.
+
+            Args:
+                aggregation_method (str): Method for aggregating scores.
+                scores (Dict[str, List[float]]): Metric scores for each test case.
+
+            Returns:
+                Dict[str, float]: Aggregated metric scores.
+            """
+            aggregation_functions = {
+                "avg": lambda x: sum(x) / len(x),
+                "max": max,
+                "min": min,
+            }
+            if self.aggregation_method not in aggregation_functions:
+                raise ValueError(
+                    "Incorrect 'aggregation_method', only accepts ['avg', 'min', 'max']"
+                )
+            return {
+                key: aggregation_functions[self.aggregation_method](value)
+                for key, value in scores.items()
+            }
+
+        def on_epoch_begin(
+            self,
+            args: TrainingArguments,
+            state: TrainerState,
+            control: TrainerControl,
+            **kwargs,
+        ):
+            """
+            Event triggered at the beginning of each training epoch.
+            """
+            self.epoch_counter += 1
+
+        def on_epoch_end(
+            self,
+            args: TrainingArguments,
+            state: TrainerState,
+            control: TrainerControl,
+            **kwargs,
+        ):
+            """
+            Event triggered at the end of each training epoch.
+            """
+            control.should_log = True
+            self.rich_manager.change_spinner_text(
+                self.task_descriptions["generating"]
+            )
+            test_cases = generate_test_cases(
+                self.trainer.model,
+                self.trainer.tokenizer,
+                self.tokenizer_args,
+                self.evaluation_dataset,
+            )
+            self.evaluation_dataset.test_cases = test_cases
+
+        def on_log(
+            self,
+            args: TrainingArguments,
+            state: TrainerState,
+            control: TrainerControl,
+            **kwargs,
+        ):
+            """
+            Event triggered after logging the last logs.
+            """
+            if (
+                self.show_table
+                and len(state.log_history) <= self.trainer.args.num_train_epochs
+            ):
+                self.rich_manager.advance_progress()
+
+                if self.epoch_counter % self.show_table_every == 0:
+                    self.rich_manager.change_spinner_text(
+                        self.task_descriptions["evaluate"]
+                    )
+
+                    scores = self._calculate_metric_scores()
+                    self.deepeval_metric_history.append(scores)
+                    self.deepeval_metric_history[-1].update(
+                        state.log_history[-1]
+                    )
+
+                    self.rich_manager.change_spinner_text(
+                        self.task_descriptions["training"]
+                    )
+                    columns = self._generate_table()
+                    self.rich_manager.update(columns)
+
+        def _generate_table(self):
+            """
+            Generates table, along with progress bars
+
+            Returns:
+                rich.Columns: contains table and 2 progress bars
+            """
+            column, table = self.rich_manager.create_column()
+            order = get_column_order(self.deepeval_metric_history[-1])
+
+            if self.show_table:
+                for key in order:
+                    table.add_column(key)
+
+                for row in self.deepeval_metric_history:
+                    table.add_row(*[str(row[value]) for value in order])
+
+            return column
+
+        def on_train_end(
+            self,
+            args: TrainingArguments,
+            state: TrainerState,
+            control: TrainerControl,
+            **kwargs,
+        ):
+            """
+            Event triggered at the end of model training.
+            """
+            self.rich_manager.change_spinner_text(
+                self.task_descriptions["training_end"]
+            )
+            self.rich_manager.stop()
+
+        def on_train_begin(
+            self,
+            args: TrainingArguments,
+            state: TrainerState,
+            control: TrainerControl,
+            **kwargs,
+        ):
+            """
+            Event triggered at the beginning of model training.
+ """ + self.rich_manager.start() + self.rich_manager.change_spinner_text( + self.task_descriptions["training"] + ) + +except ImportError: + + class DeepEvalHuggingFaceCallback: + def __init__(self, *args, **kwargs): + raise ImportError( + "The 'transformers' library is required to use the DeepEvalHuggingFaceCallback." + ) diff --git a/deepeval/integrations/hugging_face/rich_manager.py b/deepeval/integrations/hugging_face/rich_manager.py new file mode 100644 index 000000000..7729419ac --- /dev/null +++ b/deepeval/integrations/hugging_face/rich_manager.py @@ -0,0 +1,109 @@ +from typing import Union + +from rich.live import Live +from rich.text import Text +from rich.table import Table +from rich.columns import Columns +from rich.console import Console +from rich.progress import Progress, BarColumn, SpinnerColumn, TextColumn + + +class RichManager: + def __init__(self, show_table: bool, total_train_epochs: int) -> None: + """ + Initialize RichManager. + + Args: + show_table (bool): Flag to show or hide the table. + total_train_epochs (int): Total number of training epochs. + """ + self.show_table = show_table + self.total_train_epochs = total_train_epochs + self.console = Console() + self.live = Live(auto_refresh=True, console=self.console) + self.train_bar_started = False + + self.progress_bar_columns = [ + TextColumn( + "{task.description} [progress.percentage][green][{task.percentage:>3.1f}%]:", + justify="right", + ), + BarColumn(), + TextColumn( + "[green][ {task.completed}/{task.total} epochs ]", + justify="right", + ), + ] + self.spinner_columns = [ + TextColumn("{task.description}", justify="right"), + SpinnerColumn(spinner_name="simpleDotsScrolling"), + ] + + self.empty_column = Text("\n") + + def _initialize_progress_trackers(self) -> None: + """ + Initialize progress trackers (progress and spinner columns). + """ + self.progress = Progress(*self.progress_bar_columns, auto_refresh=False) + self.spinner = Progress(*self.spinner_columns) + + self.progress_task = self.progress.add_task( + "Train Progress", total=self.total_train_epochs + ) + self.spinner_task = self.spinner.add_task("Initializing") + + column_list = [self.spinner, self.progress, self.empty_column] + column_list.insert(0, Table()) if self.show_table else None + + column = Columns(column_list, equal=True, expand=True) + self.live.update(column, refresh=True) + + def change_spinner_text(self, text: str) -> None: + """ + Change the text displayed in the spinner. + + Args: + text (str): Text to be displayed in the spinner. + """ + self.spinner.reset(self.spinner_task, description=text) + + def stop(self) -> None: + """Stop the live display.""" + self.live.stop() + + def start(self) -> None: + """Start the live display and initialize progress trackers.""" + self.live.start() + self._initialize_progress_trackers() + + def update(self, column: Columns) -> None: + """ + Update the live display with a new column. + + Args: + column (Columns): New column to be displayed. + """ + self.live.update(column, refresh=True) + + def create_column(self) -> Union[Columns, Table]: + """ + Create a new column with an optional table. + + Returns: + Tuple[Columns, Table]: Tuple containing the new column and an optional table. 
+ """ + new_table = Table() + + column_list = [self.spinner, self.progress, self.empty_column] + column_list.insert(0, new_table) if self.show_table else None + + column = Columns(column_list, equal=True, expand=True) + return column, new_table + + def advance_progress(self) -> None: + """Advance the progress tracker.""" + if not self.train_bar_started: + self.progress.start() + self.train_bar_started = True + self.progress.update(self.progress_task, advance=1) diff --git a/deepeval/integrations/hugging_face/tests/test_callbacks.py b/deepeval/integrations/hugging_face/tests/test_callbacks.py new file mode 100644 index 000000000..851fd434e --- /dev/null +++ b/deepeval/integrations/hugging_face/tests/test_callbacks.py @@ -0,0 +1,160 @@ +"""Test for callbacks +""" + +from transformers import ( + Seq2SeqTrainer, + Seq2SeqTrainingArguments, + T5Tokenizer, + T5ForConditionalGeneration, + DataCollatorForSeq2Seq, +) + +from datasets import load_dataset + +from deepeval.integrations.hugging_face import DeepEvalHuggingFaceCallback +from deepeval.metrics import HallucinationMetric, AnswerRelevancyMetric +from deepeval.dataset import EvaluationDataset, Golden + +import os +import random + +os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true" +os.environ["OPENAI_API_KEY"] = "API-KEY" + + +def create_prompt(row): + """Merge Context and Question into a single string""" + contexts = row["context"]["contexts"] + question = row["question"] + prompt = f"""{'CONTEXT: ' + str("; ".join(contexts)) if contexts else ''} + QUESTION: {question} + ANSWER:""" + return {"input": prompt, "response": row["long_answer"]} + + +def prepare_dataset(tokenizer, tokenizer_args): + dataset = load_dataset("pubmed_qa", "pqa_labeled") + merged_dataset = dataset.map( + create_prompt, + remove_columns=[ + "question", + "context", + "long_answer", + "pubid", + "final_decision", + ], + ) + + def tokenize_text(dataset, padding="max_length"): + model_input = tokenizer(dataset["input"], **tokenizer_args) + response = tokenizer(dataset["response"], **tokenizer_args) + + if padding == "max_length": + response["input_ids"] = [ + [(l if l != tokenizer.pad_token_id else -100) for l in label] + for label in response["input_ids"] + ] + + model_input["labels"] = response["input_ids"] + return model_input + + tokenized_dataset = merged_dataset.map( + tokenize_text, remove_columns=["input", "response"] + ) + tokenized_dataset = tokenized_dataset.map( + lambda x: { + "input_ids": x["input_ids"][0], + "labels": x["labels"][0], + "attention_mask": x["attention_mask"][0], + } + ) + return dataset, merged_dataset, tokenized_dataset + + +def create_deepeval_dataset(dataset, sample_size): + total_length = len(dataset) + random_index_list = [ + random.randint(0, total_length) for _ in range(sample_size) + ] + print(random_index_list) + eval_dataset = [dataset[row] for row in random_index_list] + goldens = [] + for row in eval_dataset: + context = ["; ".join(row["context"]["contexts"])] + golden = Golden( + input=row["question"], + expectedOutput=row["long_answer"], + context=context, + retrieval_context=context, + ) + goldens.append(golden) + + return EvaluationDataset(goldens=goldens) + + +if __name__ == "__main__": + # initialize tokenizer + tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small") + + # initalize model + model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small") + model.resize_token_embeddings(len(tokenizer)) + + # create tokenized dataset + tokenizer_args = { + "return_tensors": "pt", + "max_length": 128, + 
"padding": "max_length", + "truncation": True, + "padding": True, + } + + dataset, merged_dataset, tokenized_dataset = prepare_dataset( + tokenizer, tokenizer_args + ) + + label_pad_token_id = -100 + data_collator = DataCollatorForSeq2Seq( + tokenizer, + model=model, + label_pad_token_id=label_pad_token_id, + pad_to_multiple_of=8, + ) + + repository_id = f"flan-t5-small" + + # Define training args + training_args = Seq2SeqTrainingArguments( + output_dir=repository_id, + overwrite_output_dir=True, + num_train_epochs=50, + per_device_train_batch_size=8, + ) + + # Create Trainer instance + trainer = Seq2SeqTrainer( + model=model, + tokenizer=tokenizer, + args=training_args, + data_collator=data_collator, + train_dataset=tokenized_dataset["train"], + ) + + eval_dataset = create_deepeval_dataset(dataset["train"], sample_size=5) + hallucination_metric = HallucinationMetric(threshold=0.3) + answer_relevancy_metric = AnswerRelevancyMetric( + threshold=0.5, model="gpt-3.5-turbo" + ) + metrics = [hallucination_metric, answer_relevancy_metric] + + # initalize DeepEvalHuggingFaceCallback + callback = DeepEvalHuggingFaceCallback( + metrics=metrics, + evaluation_dataset=eval_dataset, + tokenizer_args=tokenizer_args, + trainer=trainer, + show_table=True, + show_table_every=1, + ) + trainer.add_callback(callback) + trainer.train() diff --git a/deepeval/integrations/hugging_face/utils.py b/deepeval/integrations/hugging_face/utils.py new file mode 100644 index 000000000..fde55fb44 --- /dev/null +++ b/deepeval/integrations/hugging_face/utils.py @@ -0,0 +1,56 @@ +from deepeval.test_case import LLMTestCase +from deepeval.dataset import EvaluationDataset +from deepeval.dataset.utils import convert_goldens_to_test_cases +from typing import List, Dict + + +def get_column_order(scores: Dict) -> List[str]: + """ + Determine the order of columns for displaying scores. + + Args: + scores (Dict): Dictionary containing scores. + + Returns: + List[str]: List of column names in the desired order. + """ + order = ["epoch", "step", "loss", "learning_rate"] + order.extend([key for key in scores.keys() if key not in order]) + return order + + +def generate_test_cases( + model, + tokenizer, + tokenizer_args: Dict, + evaluation_dataset: EvaluationDataset, +) -> List[LLMTestCase]: + """ + Generate test cases based on a language model. + + Args: + model: The language model to generate outputs. + tokenizer: The tokenizer for processing prompts. + tokenizer_args (Dict): Arguments for the tokenizer. + evaluation_dataset (EvaluationDataset): The dataset containing Golden. + + Returns: + List[LLMTestCase]: List of generated test cases. 
+ """ + goldens = evaluation_dataset.goldens + for golden in goldens: + prompt = f"""{'CONTEXT: ' + str("; ".join(golden.context)) if golden.context else ''} + QUESTION: {golden.input} + ANSWER:""" + + tokenized_output = tokenizer(prompt, **tokenizer_args) + input_ids = tokenized_output.input_ids + outputs = model.generate(input_ids) + decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True) + golden.actual_output = decoded_output + + test_cases = convert_goldens_to_test_cases( + goldens=evaluation_dataset.goldens, + dataset_alias=evaluation_dataset.alias, + ) + return test_cases diff --git a/deepeval/integrations/llama_index/__init__.py b/deepeval/integrations/llama_index/__init__.py new file mode 100644 index 000000000..8397dec20 --- /dev/null +++ b/deepeval/integrations/llama_index/__init__.py @@ -0,0 +1,9 @@ +from deepeval.integrations.llama_index.callback import LlamaIndexCallbackHandler +from deepeval.integrations.llama_index.evaluators import ( + AnswerRelevancyEvaluator as DeepEvalAnswerRelevancyEvaluator, + FaithfulnessEvaluator as DeepEvalFaithfulnessEvaluator, + ContextualRelevancyEvaluator as DeepEvalContextualRelevancyEvaluator, + SummarizationEvaluator as DeepEvalSummarizationEvaluator, + ToxicityEvaluator as DeepEvalToxicityEvaluator, + BiasEvaluator as DeepEvalBiasEvaluator, +) diff --git a/deepeval/tracing/integrations/llama_index.py b/deepeval/integrations/llama_index/callback.py similarity index 100% rename from deepeval/tracing/integrations/llama_index.py rename to deepeval/integrations/llama_index/callback.py diff --git a/deepeval/integrations/llama_index/evaluators.py b/deepeval/integrations/llama_index/evaluators.py new file mode 100644 index 000000000..60ca3163a --- /dev/null +++ b/deepeval/integrations/llama_index/evaluators.py @@ -0,0 +1,295 @@ +import asyncio +from typing import Optional, Sequence, Any +from llama_index.evaluation.base import BaseEvaluator, EvaluationResult + +from deepeval.test_case import LLMTestCase +from deepeval.metrics import ( + AnswerRelevancyMetric, + FaithfulnessMetric, + SummarizationMetric, + ContextualRelevancyMetric, + BiasMetric, + ToxicityMetric, +) +from deepeval.integrations.llama_index.utils import conform_contexts_type + + +class AnswerRelevancyEvaluator(BaseEvaluator): + def __init__( + self, + threshold: float = 0.5, + include_reason: bool = True, + model: Optional[str] = None, + ): + self.threshold = threshold + self.include_reason = include_reason + self.model = model + + def _get_prompts(self): + pass + + def _update_prompts(self): + pass + + async def aevaluate( + self, + query: Optional[str] = None, + response: Optional[str] = None, + contexts: Optional[Sequence[str]] = None, + sleep_time_in_seconds: int = 0, + **kwargs: Any, + ) -> EvaluationResult: + del kwargs # Unused + + await asyncio.sleep(sleep_time_in_seconds) + + if query is None or response is None or contexts is None: + raise ValueError("Query, response, and contexts must be provided") + + test_case = LLMTestCase( + input=query, + actual_output=response, + retrieval_context=conform_contexts_type(contexts), + ) + metric = AnswerRelevancyMetric( + threshold=self.threshold, + include_reason=self.include_reason, + model=self.model, + ) + metric.measure(test_case) + return EvaluationResult( + query=query, + response=response, + passing=metric.is_successful(), + score=metric.score, + feedback=metric.reason, + ) + + +class FaithfulnessEvaluator(BaseEvaluator): + def __init__( + self, + threshold: float = 0.5, + include_reason: bool = True, + model: 
Optional[str] = None, + ): + self.threshold = threshold + self.include_reason = include_reason + self.model = model + + def _get_prompts(self): + pass + + def _update_prompts(self): + pass + + async def aevaluate( + self, + query: Optional[str] = None, + response: Optional[str] = None, + contexts: Optional[Sequence[str]] = None, + sleep_time_in_seconds: int = 0, + **kwargs: Any, + ) -> EvaluationResult: + del kwargs # Unused + + await asyncio.sleep(sleep_time_in_seconds) + + if query is None or response is None or contexts is None: + raise ValueError("Query, response, and contexts must be provided") + + test_case = LLMTestCase( + input=query, + actual_output=response, + retrieval_context=conform_contexts_type(contexts), + ) + metric = FaithfulnessMetric( + threshold=self.threshold, + include_reason=self.include_reason, + model=self.model, + ) + metric.measure(test_case) + return EvaluationResult( + query=query, + response=response, + passing=metric.is_successful(), + score=metric.score, + feedback=metric.reason, + ) + + +class ContextualRelevancyEvaluator(BaseEvaluator): + def __init__( + self, + threshold: float = 0.5, + include_reason: bool = True, + model: Optional[str] = None, + ): + self.threshold = threshold + self.include_reason = include_reason + self.model = model + + def _get_prompts(self): + pass + + def _update_prompts(self): + pass + + async def aevaluate( + self, + query: Optional[str] = None, + response: Optional[str] = None, + contexts: Optional[Sequence[str]] = None, + sleep_time_in_seconds: int = 0, + **kwargs: Any, + ) -> EvaluationResult: + del kwargs # Unused + + await asyncio.sleep(sleep_time_in_seconds) + + if query is None or response is None or contexts is None: + raise ValueError("Query, response, and contexts must be provided") + + test_case = LLMTestCase( + input=query, + actual_output=response, + retrieval_context=conform_contexts_type(contexts), + ) + metric = ContextualRelevancyMetric( + threshold=self.threshold, + include_reason=self.include_reason, + model=self.model, + ) + metric.measure(test_case) + return EvaluationResult( + query=query, + response=response, + passing=metric.is_successful(), + score=metric.score, + feedback=metric.reason, + ) + + +class SummarizationEvaluator(BaseEvaluator): + def __init__( + self, + threshold: float = 0.5, + model: Optional[str] = None, + ): + self.threshold = threshold + self.model = model + + def _get_prompts(self): + pass + + def _update_prompts(self): + pass + + async def aevaluate( + self, + query: Optional[str] = None, + response: Optional[str] = None, + contexts: Optional[Sequence[str]] = None, + sleep_time_in_seconds: int = 0, + **kwargs: Any, + ) -> EvaluationResult: + del kwargs # Unused + del contexts # Unused + + await asyncio.sleep(sleep_time_in_seconds) + + if query is None or response is None: + raise ValueError("Query and response must be provided") + + test_case = LLMTestCase(input=query, actual_output=response) + metric = SummarizationMetric(threshold=self.threshold, model=self.model) + metric.measure(test_case) + return EvaluationResult( + query=query, + response=response, + passing=metric.is_successful(), + score=metric.score, + feedback=metric.reason, + ) + + +class BiasEvaluator(BaseEvaluator): + def __init__(self, threshold: float = 0.5): + self.threshold = threshold + + def _get_prompts(self): + pass + + def _update_prompts(self): + pass + + async def aevaluate( + self, + query: Optional[str] = None, + response: Optional[str] = None, + contexts: Optional[Sequence[str]] = None, + 
sleep_time_in_seconds: int = 0, + **kwargs: Any, + ) -> EvaluationResult: + del kwargs # Unused + del contexts # Unused + + await asyncio.sleep(sleep_time_in_seconds) + + if query is None or response is None: + raise ValueError("Query and response must be provided") + + test_case = LLMTestCase( + input=query, + actual_output=response, + ) + metric = BiasMetric(threshold=self.threshold) + metric.measure(test_case) + return EvaluationResult( + query=query, + response=response, + passing=metric.is_successful(), + score=metric.score, + feedback=metric.reason, + ) + + +class ToxicityEvaluator(BaseEvaluator): + def __init__(self, threshold: float = 0.5): + self.threshold = threshold + + def _get_prompts(self): + pass + + def _update_prompts(self): + pass + + async def aevaluate( + self, + query: Optional[str] = None, + response: Optional[str] = None, + contexts: Optional[Sequence[str]] = None, + sleep_time_in_seconds: int = 0, + **kwargs: Any, + ) -> EvaluationResult: + del kwargs # Unused + del contexts # Unused + + await asyncio.sleep(sleep_time_in_seconds) + + if query is None or response is None: + raise ValueError("Query and response must be provided") + + test_case = LLMTestCase( + input=query, + actual_output=response, + ) + metric = ToxicityMetric(threshold=self.threshold) + metric.measure(test_case) + return EvaluationResult( + query=query, + response=response, + passing=metric.is_successful(), + score=metric.score, + feedback=metric.reason, + ) diff --git a/deepeval/integrations/llama_index/tests/test_evaluators.py b/deepeval/integrations/llama_index/tests/test_evaluators.py new file mode 100644 index 000000000..45d46c4ca --- /dev/null +++ b/deepeval/integrations/llama_index/tests/test_evaluators.py @@ -0,0 +1,39 @@ +import pytest +from deepeval.integrations.llama_index import ( + DeepEvalAnswerRelevancyEvaluator, + DeepEvalFaithfulnessEvaluator, + DeepEvalContextualRelevancyEvaluator, + DeepEvalSummarizationEvaluator, + DeepEvalBiasEvaluator, + DeepEvalToxicityEvaluator, +) + + +def test_answer_relevancy(): + evaluator = DeepEvalAnswerRelevancyEvaluator() + assert evaluator is not None + + +def test_faithfulness(): + evaluator = DeepEvalFaithfulnessEvaluator() + assert evaluator is not None + + +def test_contextual_relevancy(): + evaluator = DeepEvalContextualRelevancyEvaluator() + assert evaluator is not None + + +def test_summarization(): + evaluator = DeepEvalSummarizationEvaluator() + assert evaluator is not None + + +def test_bias(): + evaluator = DeepEvalBiasEvaluator() + assert evaluator is not None + + +def test_toxicity(): + evaluator = DeepEvalToxicityEvaluator() + assert evaluator is not None diff --git a/deepeval/integrations/llama_index/utils.py b/deepeval/integrations/llama_index/utils.py new file mode 100644 index 000000000..073499c9a --- /dev/null +++ b/deepeval/integrations/llama_index/utils.py @@ -0,0 +1,10 @@ +from typing import Optional, Sequence, List, Union + + +def conform_contexts_type( + contexts: Optional[Sequence[str]] = None, +) -> Union[List[str], None]: + if contexts is None: + return None + + return list(contexts) diff --git a/deepeval/metrics/__init__.py b/deepeval/metrics/__init__.py index 2f3b8f3cc..ccb188b2e 100644 --- a/deepeval/metrics/__init__.py +++ b/deepeval/metrics/__init__.py @@ -25,6 +25,6 @@ # RAGASCoherenceMetric as CoherenceMetric, # RAGASMaliciousnessMetric as MaliciousnessMetric, # ) -from .unbias_metric import UnBiasedMetric -from .non_toxic_metric import NonToxicMetric +from .bias import BiasMetric +from .toxicity import 
ToxicityMetric from .hallucination_metric import HallucinationMetric diff --git a/deepeval/metrics/answer_relevancy.py b/deepeval/metrics/answer_relevancy.py index 679ab81ca..1b89ac87f 100644 --- a/deepeval/metrics/answer_relevancy.py +++ b/deepeval/metrics/answer_relevancy.py @@ -6,7 +6,7 @@ from deepeval.utils import trimToJson from deepeval.test_case import LLMTestCase from deepeval.metrics import BaseMetric -from deepeval.models import GPTModel +from deepeval.models import GPTModel, DeepEvalBaseModel from deepeval.templates import AnswerRelevancyTemplate from deepeval.progress_context import metrics_progress_context @@ -20,11 +20,14 @@ class AnswerRelevancyMetric(BaseMetric): def __init__( self, threshold: float = 0.5, - model: Optional[Union[str, BaseChatModel]] = None, + model: Optional[Union[str, DeepEvalBaseModel, BaseChatModel]] = None, include_reason: bool = True, ): self.threshold = threshold - self.model = GPTModel(model=model) + if isinstance(model, DeepEvalBaseModel): + self.model = model + else: + self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason self.n = 5 @@ -85,7 +88,7 @@ def _generate_reason( ) res = self.model(prompt) - return res.content + return res def _generate_verdicts( self, original_question: str @@ -95,7 +98,7 @@ def _generate_verdicts( ) res = self.model(prompt) - json_output = trimToJson(res.content) + json_output = trimToJson(res) data = json.loads(json_output) verdicts = [AnswerRelvancyVerdict(**item) for item in data["verdicts"]] @@ -115,7 +118,7 @@ def _generate_key_points( ) res = self.model(prompt) - json_output = trimToJson(res.content) + json_output = trimToJson(res) data = json.loads(json_output) return data["key_points"] diff --git a/deepeval/metrics/bias.py b/deepeval/metrics/bias.py new file mode 100644 index 000000000..91d1b6fe7 --- /dev/null +++ b/deepeval/metrics/bias.py @@ -0,0 +1,45 @@ +"""Metric for bias classifier - using the same min score subtraction methodology as the toxic classifier +Rationale for bias classifier is described here https://arxiv.org/pdf/2208.05777.pdf +1 - Not Biased +0 - Bias +""" + +from typing import Optional, List +from deepeval.metrics import BaseMetric +from deepeval.test_case import LLMTestCase, LLMTestCaseParams +from deepeval.scorer import Scorer + + +class BiasMetric(BaseMetric): + def __init__( + self, + model_name: str = "original", + threshold: float = 0.5, + ): # see paper for rationale https://arxiv.org/pdf/2208.05777.pdf + self.model_name = model_name + self.threshold = threshold + + def measure(self, test_case: LLMTestCase): + if test_case.input is None or test_case.actual_output is None: + raise ValueError("Input or actual output cannot be None") + + result = Scorer.neural_bias_score( + test_case.actual_output, model=self.model_name + ) + if result[0]["label"] == "Biased": + bias_score = 0.5 + (result[0]["score"] / 2) + else: + bias_score = 0.5 - (result[0]["score"] / 2) + + self.success = bias_score <= self.threshold + self.score = bias_score + + return self.score + + def is_successful(self) -> bool: + self.success = self.score <= self.threshold + return self.success + + @property + def __name__(self): + return "Bias" diff --git a/deepeval/metrics/contextual_precision.py b/deepeval/metrics/contextual_precision.py index a548b9fbb..f3b22e406 100644 --- a/deepeval/metrics/contextual_precision.py +++ b/deepeval/metrics/contextual_precision.py @@ -6,7 +6,7 @@ from deepeval.utils import trimToJson from deepeval.test_case import 
LLMTestCase from deepeval.metrics import BaseMetric -from deepeval.models import GPTModel +from deepeval.models import GPTModel, DeepEvalBaseModel from deepeval.templates import ContextualPrecisionTemplate from deepeval.progress_context import metrics_progress_context @@ -21,12 +21,15 @@ class ContextualPrecisionMetric(BaseMetric): def __init__( self, threshold: float = 0.5, - model: Optional[Union[str, BaseChatModel]] = None, + model: Optional[Union[str, DeepEvalBaseModel, BaseChatModel]] = None, include_reason: bool = True, ): self.threshold = threshold self.include_reason = include_reason - self.model = GPTModel(model=model) + if isinstance(model, DeepEvalBaseModel): + self.model = model + else: + self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() def measure(self, test_case: LLMTestCase) -> float: @@ -82,7 +85,7 @@ def _generate_reason(self, input: str, score: float): ) res = self.model(prompt) - return res.content + return res def _generate_score(self): # Convert verdicts to a binary list where 'yes' is 1 and others are 0 @@ -122,7 +125,7 @@ def _generate_verdicts( ) res = self.model(prompt) - json_output = trimToJson(res.content) + json_output = trimToJson(res) data = json.loads(json_output) verdicts = [ ContextualPrecisionVerdict(**item) for item in data["verdicts"] diff --git a/deepeval/metrics/contextual_recall.py b/deepeval/metrics/contextual_recall.py index 56485cc93..b4a2bd45f 100644 --- a/deepeval/metrics/contextual_recall.py +++ b/deepeval/metrics/contextual_recall.py @@ -6,7 +6,7 @@ from deepeval.utils import trimToJson from deepeval.test_case import LLMTestCase from deepeval.metrics import BaseMetric -from deepeval.models import GPTModel +from deepeval.models import GPTModel, DeepEvalBaseModel from deepeval.templates import ContextualRecallTemplate from deepeval.progress_context import metrics_progress_context @@ -20,11 +20,14 @@ class ContextualRecallMetric(BaseMetric): def __init__( self, threshold: float = 0.5, - model: Optional[Union[str, BaseChatModel]] = None, + model: Optional[Union[str, DeepEvalBaseModel, BaseChatModel]] = None, include_reason: bool = True, ): self.threshold = threshold - self.model = GPTModel(model=model) + if isinstance(model, DeepEvalBaseModel): + self.model = model + else: + self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason self.n = 5 @@ -76,7 +79,7 @@ def _generate_reason(self, expected_output: str, score: float): ) res = self.model(prompt) - return res.content + return res def _generate_score(self): if len(self.verdicts) == 0: @@ -96,7 +99,7 @@ def _generate_verdicts( expected_output=expected_output, retrieval_context=retrieval_context ) res = self.model(prompt) - json_output = trimToJson(res.content) + json_output = trimToJson(res) data = json.loads(json_output) verdicts = [ ContextualRecallVerdict(**item) for item in data["verdicts"] diff --git a/deepeval/metrics/contextual_relevancy.py b/deepeval/metrics/contextual_relevancy.py index 38badc1e7..1ef8ea496 100644 --- a/deepeval/metrics/contextual_relevancy.py +++ b/deepeval/metrics/contextual_relevancy.py @@ -7,7 +7,7 @@ from deepeval.utils import trimToJson from deepeval.test_case import LLMTestCase from deepeval.metrics import BaseMetric -from deepeval.models import GPTModel +from deepeval.models import GPTModel, DeepEvalBaseModel from deepeval.templates import ContextualRelevancyTemplate from deepeval.progress_context import metrics_progress_context @@ -21,11 +21,14 @@ class 
ContextualRelevancyMetric(BaseMetric): def __init__( self, threshold: float = 0.5, - model: Optional[Union[str, BaseChatModel]] = None, + model: Optional[Union[str, DeepEvalBaseModel, BaseChatModel]] = None, include_reason: bool = True, ): self.threshold = threshold - self.model = GPTModel(model=model) + if isinstance(model, DeepEvalBaseModel): + self.model = model + else: + self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason @@ -73,7 +76,7 @@ def _generate_reason(self, input: str, score: float): ) res = self.model(prompt) - return res.content + return res def _generate_score(self): irrelevant_sentences = 0 @@ -103,7 +106,7 @@ def _generate_verdicts( ) res = self.model(prompt) - json_output = trimToJson(res.content) + json_output = trimToJson(res) data = json.loads(json_output) verdicts = [ ContextualRelevancyVerdict(**item) for item in data["verdicts"] diff --git a/deepeval/metrics/cost.py b/deepeval/metrics/cost.py index 642feef5c..8c2da03e5 100644 --- a/deepeval/metrics/cost.py +++ b/deepeval/metrics/cost.py @@ -3,8 +3,8 @@ class CostMetric(BaseMetric): - def __init__(self, threshold: float): - self.threshold = threshold + def __init__(self, max_cost: float): + self.threshold = max_cost def measure(self, test_case: LLMTestCase): self.success = test_case.cost <= self.threshold diff --git a/deepeval/metrics/faithfulness.py b/deepeval/metrics/faithfulness.py index d019b79b4..8d923b15e 100644 --- a/deepeval/metrics/faithfulness.py +++ b/deepeval/metrics/faithfulness.py @@ -7,7 +7,7 @@ from deepeval.test_case import LLMTestCase from deepeval.metrics import BaseMetric from deepeval.utils import trimToJson -from deepeval.models import GPTModel +from deepeval.models import GPTModel, DeepEvalBaseModel from deepeval.templates import FaithfulnessTemplate from deepeval.progress_context import metrics_progress_context @@ -22,11 +22,14 @@ class FaithfulnessMetric(BaseMetric): def __init__( self, threshold: float = 0.5, - model: Optional[Union[str, BaseChatModel]] = None, + model: Optional[Union[str, DeepEvalBaseModel, BaseChatModel]] = None, include_reason: bool = True, ): self.threshold = threshold - self.model = GPTModel(model=model) + if isinstance(model, DeepEvalBaseModel): + self.model = model + else: + self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason @@ -85,7 +88,7 @@ def _generate_reason(self, score: float): ) res = self.model(prompt) - return res.content + return res def _generate_truths( self, @@ -95,7 +98,7 @@ def _generate_truths( ): prompt = FaithfulnessTemplate.generate_truths(text=context) res = self.model(prompt) - json_output = trimToJson(res.content) + json_output = trimToJson(res) data = json.loads(json_output) truths = data["truths"] @@ -134,7 +137,7 @@ def _generate_verdicts( ) res = self.model(prompt) - json_output = trimToJson(res.content) + json_output = trimToJson(res) data = json.loads(json_output) verdicts = [FaithfulnessVerdict(**item) for item in data["verdicts"]] diff --git a/deepeval/metrics/g_eval.py b/deepeval/metrics/g_eval.py index 62081e36d..ce60c492d 100644 --- a/deepeval/metrics/g_eval.py +++ b/deepeval/metrics/g_eval.py @@ -9,7 +9,7 @@ evaluation_results_template, ) from deepeval.utils import trimToJson -from deepeval.models import GPTModel +from deepeval.models import GPTModel, DeepEvalBaseModel from pydantic import BaseModel @@ -26,7 +26,7 @@ def __init__( evaluation_params: List[LLMTestCaseParams], criteria: 
Optional[str] = None, evaluation_steps: Optional[List[str]] = None, - model: Optional[Union[str, BaseChatModel]] = None, + model: Optional[Union[str, DeepEvalBaseModel, BaseChatModel]] = None, threshold: float = 0.5, ): self.name = name @@ -49,7 +49,10 @@ def __init__( ) self.criteria = criteria - self.model = GPTModel(model=model) + if isinstance(model, DeepEvalBaseModel): + self.model = model + else: + self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.evaluation_steps = evaluation_steps self.threshold = threshold @@ -87,7 +90,7 @@ def generate_evaluation_steps(self): res = self.model(prompt) - return res.content + return res def evaluate(self, test_case: LLMTestCase) -> Tuple[int, str]: text = """""" @@ -102,7 +105,7 @@ def evaluate(self, test_case: LLMTestCase) -> Tuple[int, str]: ) res = self.model(prompt) - json_output = trimToJson(res.content) + json_output = trimToJson(res) data = json.loads(json_output) return data["score"], data["reason"] diff --git a/deepeval/metrics/latency.py b/deepeval/metrics/latency.py index a37017a05..6bb2e531e 100644 --- a/deepeval/metrics/latency.py +++ b/deepeval/metrics/latency.py @@ -3,8 +3,8 @@ class LatencyMetric(BaseMetric): - def __init__(self, threshold: float): - self.threshold = threshold + def __init__(self, max_latency: float): + self.threshold = max_latency def measure(self, test_case: LLMTestCase): self.success = test_case.latency <= self.threshold diff --git a/deepeval/metrics/non_toxic_metric.py b/deepeval/metrics/non_toxic_metric.py deleted file mode 100644 index 20c9f78f2..000000000 --- a/deepeval/metrics/non_toxic_metric.py +++ /dev/null @@ -1,71 +0,0 @@ -"""Metric for toxic classifier. -1 - Healthy -0 - Toxic -""" -from typing import List -from deepeval.test_case import LLMTestCase, LLMTestCaseParams -from deepeval.metrics.base_metric import BaseMetric -from deepeval.scorer import Scorer - - -class NonToxicMetric(BaseMetric): - def __init__( - self, - evaluation_params: List[LLMTestCaseParams], - model_name: str = "original", - threshold: float = 0.5, - ): - if not evaluation_params: - raise ValueError("evaluation_params cannot be empty or None") - - self.evaluation_params = evaluation_params - self.threshold, self.model_name = threshold, model_name - - def __call__(self, test_case: LLMTestCase): - score = self.measure(test_case.actual_output) - score = score["min_score"] - return score - - def measure(self, test_case: LLMTestCase): - # Check if each param in evaluation_params exists and is not None in test_case - for param in self.evaluation_params: - if ( - not hasattr(test_case, param.value) - or getattr(test_case, param.value) is None - ): - raise ValueError( - f"Test case is missing the required attribute: {param.value}" - ) - - total_scores = [] - - for param in self.evaluation_params: - text_to_evaluate = getattr(test_case, param.value) - _, results = Scorer.neural_toxic_score( - prediction=text_to_evaluate, model=self.model_name - ) - # sample output - # {'toxicity': 0.98057544, - # 'severe_toxicity': 0.106649496, - # 'obscene': 0.32923067, - # 'threat': 0.018646086, - # 'insult': 0.7514423, - # 'identity_attack': 0.86643445} - - score = 1 - results["toxicity"] - total_scores.append(score) - - average_score = sum(total_scores) / len(total_scores) - - # Check if the average score meets the minimum requirement - self.success = average_score >= self.threshold - self.score = average_score - return self.score - - def is_successful(self) -> bool: - self.success = self.score >= self.threshold 
- return self.success - - @property - def __name__(self): - return "Toxicity" diff --git a/deepeval/metrics/ragas.py b/deepeval/metrics/ragas.py index b5c26e7fc..154dbebcf 100644 --- a/deepeval/metrics/ragas.py +++ b/deepeval/metrics/ragas.py @@ -19,7 +19,7 @@ class RAGASContextualPrecisionMetric(BaseMetric): def __init__( self, threshold: float = 0.3, - model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo", + model: Optional[Union[str, BaseChatModel]] = None, ): self.threshold = threshold self.model = GPTModel(model=model) diff --git a/deepeval/metrics/summarization.py b/deepeval/metrics/summarization.py index fb1a005c4..b4b21f953 100644 --- a/deepeval/metrics/summarization.py +++ b/deepeval/metrics/summarization.py @@ -6,12 +6,13 @@ from deepeval.test_case import LLMTestCase from deepeval.metrics import BaseMetric -from deepeval.models import GPTModel +from deepeval.models import GPTModel, DeepEvalBaseModel from deepeval.utils import trimToJson from deepeval.templates import ( closed_end_questions_template, closed_end_answers_template, ) +from deepeval.progress_context import metrics_progress_context class ScoreType(Enum): @@ -23,12 +24,15 @@ class SummarizationMetric(BaseMetric): def __init__( self, threshold: float = 0.5, - model: Optional[Union[str, BaseChatModel]] = None, + model: Optional[Union[str, DeepEvalBaseModel, BaseChatModel]] = None, n: Optional[int] = 5, assessment_questions: Optional[List[str]] = None, ): self.threshold = threshold - self.model = GPTModel(model=model) + if isinstance(model, DeepEvalBaseModel): + self.model = model + else: + self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.assessment_questions = assessment_questions self.n = n @@ -39,44 +43,51 @@ def measure(self, test_case: LLMTestCase): if test_case.input is None or test_case.actual_output is None: raise ValueError("Input or actual output cannot be None") - source_document = test_case.input - summary = test_case.actual_output + with metrics_progress_context(self.__name__, self.evaluation_model): + source_document = test_case.input + summary = test_case.actual_output - with ThreadPoolExecutor() as executor: - future_alignment = executor.submit( - self.get_score, ScoreType.ALIGNMENT, source_document, summary - ) - future_inclusion = executor.submit( - self.get_score, ScoreType.INCLUSION, source_document, summary - ) + with ThreadPoolExecutor() as executor: + future_alignment = executor.submit( + self.get_score, + ScoreType.ALIGNMENT, + source_document, + summary, + ) + future_inclusion = executor.submit( + self.get_score, + ScoreType.INCLUSION, + source_document, + summary, + ) - # Wait for the results - alignment_score = future_alignment.result() - inclusion_score = future_inclusion.result() + # Wait for the results + alignment_score = future_alignment.result() + inclusion_score = future_inclusion.result() - summarization_score = min(alignment_score, inclusion_score) + summarization_score = min(alignment_score, inclusion_score) - self.success = summarization_score >= self.threshold - self.score_breakdown = { - "Alignment": alignment_score, - "Inclusion": inclusion_score, - } - self.alignment_score = alignment_score - self.inclusion_score = inclusion_score - self.score = summarization_score - return self.score + self.success = summarization_score >= self.threshold + self.score_breakdown = { + "Alignment": alignment_score, + "Inclusion": inclusion_score, + } + self.alignment_score = alignment_score + self.inclusion_score = inclusion_score + self.score = 
summarization_score + return self.score def get_score( self, score_type: ScoreType, source_document: str, summary: str ): questions = [] if score_type == ScoreType.ALIGNMENT: - print("Calculating alignment score...") + # print("Calculating alignment score...") questions = self.generate_questions( score_type, source_document, summary ) elif score_type == ScoreType.INCLUSION: - print("Calculating inclusion score...") + # print("Calculating inclusion score...") if self.assessment_questions is None: questions = self.generate_questions( score_type, source_document, summary @@ -118,7 +129,7 @@ def generate_questions( ) res = self.model(prompt) - json_output = trimToJson(res.content) + json_output = trimToJson(res) data = json.loads(json_output) return data["questions"] @@ -129,7 +140,7 @@ def get_answer(self, question: str, text: str) -> str: ) res = self.model(prompt) - return res.content + return res def is_successful(self) -> bool: self.success = self.score >= self.threshold diff --git a/deepeval/metrics/toxicity.py b/deepeval/metrics/toxicity.py new file mode 100644 index 000000000..ee6ec1b50 --- /dev/null +++ b/deepeval/metrics/toxicity.py @@ -0,0 +1,40 @@ +from typing import List +from deepeval.test_case import LLMTestCase, LLMTestCaseParams +from deepeval.metrics.base_metric import BaseMetric +from deepeval.scorer import Scorer + + +class ToxicityMetric(BaseMetric): + def __init__( + self, + model_name: str = "original", + threshold: float = 0.5, + ): + self.threshold, self.model_name = threshold, model_name + + def measure(self, test_case: LLMTestCase): + if test_case.input is None or test_case.actual_output is None: + raise ValueError("Input or actual output cannot be None") + _, results = Scorer.neural_toxic_score( + prediction=test_case.actual_output, model=self.model_name + ) + # sample output + # {'toxicity': 0.98057544, + # 'severe_toxicity': 0.106649496, + # 'obscene': 0.32923067, + # 'threat': 0.018646086, + # 'insult': 0.7514423, + # 'identity_attack': 0.86643445} + toxicity_score = results["toxicity"] + + self.success = toxicity_score <= self.threshold + self.score = toxicity_score + return self.score + + def is_successful(self) -> bool: + self.success = self.score <= self.threshold + return self.success + + @property + def __name__(self): + return "Toxicity" diff --git a/deepeval/metrics/unbias_metric.py b/deepeval/metrics/unbias_metric.py deleted file mode 100644 index fb137caab..000000000 --- a/deepeval/metrics/unbias_metric.py +++ /dev/null @@ -1,78 +0,0 @@ -"""Metric for bias classifier - using the same min score subtraction methodology as the toxic classifier -Rationale for bias classifier is described here https://arxiv.org/pdf/2208.05777.pdf -1 - Not Biased -0 - Bias -""" - -from typing import Optional, List -from deepeval.metrics import BaseMetric -from deepeval.test_case import LLMTestCase, LLMTestCaseParams -from deepeval.scorer import Scorer - - -class UnBiasedMetric(BaseMetric): - def __init__( - self, - evaluation_params: List[LLMTestCaseParams], - model_name: str = "original", - threshold: float = 0.5, - ): # see paper for rationale https://arxiv.org/pdf/2208.05777.pdf - if not evaluation_params: - raise ValueError("evaluation_params cannot be empty or None") - - self.evaluation_params = evaluation_params - self.model_name = model_name - self.threshold = threshold - - def __call__(self, output, expected_output, query: Optional[str] = "-"): - score = self.measure(output, expected_output) - success = score >= self.threshold - return score - - def measure(self, 
test_case: LLMTestCase, return_all_scores: bool = False): - # Check if each param in evaluation_params exists and is not None in test_case - for param in self.evaluation_params: - if ( - not hasattr(test_case, param.value) - or getattr(test_case, param.value) is None - ): - raise ValueError( - f"Test case is missing the required attribute: {param.value}" - ) - - total_score = 0 # to accumulate scores for all evaluation params - all_results = ( - [] - ) # to accumulate all individual results if return_all_scores is True - - for param in self.evaluation_params: - result = Scorer.neural_bias_score( - getattr(test_case, param.value), model=self.model_name - ) - if return_all_scores: - all_results.append(result) - - if result[0]["label"] == "Biased": - v = 0.5 - (result[0]["score"] / 2) - else: - v = 0.5 + (result[0]["score"] / 2) - total_score += v - - # Calculate the average score - average_score = total_score / len(self.evaluation_params) - - self.success = average_score > self.threshold - self.score = average_score - - if return_all_scores: - return all_results - - return average_score - - def is_successful(self) -> bool: - self.success = self.score >= self.threshold - return self.success - - @property - def __name__(self): - return "Unbiased Metric" diff --git a/deepeval/models/answer_relevancy_model.py b/deepeval/models/answer_relevancy_model.py index 88ee30391..f1be5dca7 100644 --- a/deepeval/models/answer_relevancy_model.py +++ b/deepeval/models/answer_relevancy_model.py @@ -42,7 +42,7 @@ def _call(self, text: str): class CrossEncoderAnswerRelevancyModel(DeepEvalBaseModel): - def __init__(self, model_name: str | None = None): + def __init__(self, model_name: Optional[str] = None): model_name = ( "cross-encoder/nli-deberta-v3-base" if model_name is None diff --git a/deepeval/models/gpt_model.py b/deepeval/models/gpt_model.py index b17f42c15..5ba2038de 100644 --- a/deepeval/models/gpt_model.py +++ b/deepeval/models/gpt_model.py @@ -1,10 +1,9 @@ -import os -from typing import Dict, Optional, Union +from typing import Optional, Union from langchain_openai import ChatOpenAI, AzureChatOpenAI from langchain_core.language_models import BaseChatModel from deepeval.key_handler import KeyValues, KEY_FILE_HANDLER -from deepeval.models.base import DeepEvalBaseModel +from deepeval.models import DeepEvalBaseModel from deepeval.chat_completion.retry import retry_with_exponential_backoff valid_gpt_models = [ @@ -81,9 +80,9 @@ def load_model(self): return ChatOpenAI(model_name=self.model_name) @retry_with_exponential_backoff - def _call(self, prompt: str): + def _call(self, prompt: str) -> str: chat_model = self.load_model() - return chat_model.invoke(prompt) + return chat_model.invoke(prompt).content def should_use_azure_openai(self): value = KEY_FILE_HANDLER.fetch_data(KeyValues.USE_AZURE_OPENAI) diff --git a/deepeval/models/hallucination_model.py b/deepeval/models/hallucination_model.py index 65c4681bf..6e124c0d9 100644 --- a/deepeval/models/hallucination_model.py +++ b/deepeval/models/hallucination_model.py @@ -1,12 +1,17 @@ import os from typing import Optional from deepeval.singleton import Singleton -from sentence_transformers import CrossEncoder from deepeval.progress_context import progress_context class HallucinationModel(metaclass=Singleton): def __init__(self, model_name: Optional[str] = None): + try: + from sentence_transformers import CrossEncoder + except ImportError: + raise ImportError( + "The 'sentence_transformers' library is required to use the HallucinationMetric." 
+ ) # We use a smple cross encoder model model_name = ( "vectara/hallucination_evaluation_model" diff --git a/deepeval/plugins/plugin.py b/deepeval/plugins/plugin.py index 48aae2614..968e7aa97 100644 --- a/deepeval/plugins/plugin.py +++ b/deepeval/plugins/plugin.py @@ -1,21 +1,45 @@ import pytest import os +import json from rich import print from typing import Optional, Any from deepeval.constants import PYTEST_RUN_TEST_NAME -from deepeval.test_run import test_run_manager +from deepeval.test_run import test_run_manager, DeploymentConfigs def pytest_sessionstart(session: pytest.Session): test_run_manager.save_to_disk = True try: + deployment_configs = session.config.getoption("--deployment") + disable_request = False + + if deployment_configs is None: + deployment = False + else: + deployment = True + deployment_configs = json.loads(deployment_configs) + disable_request = deployment_configs.pop("is_pull_request", False) + deployment_configs = DeploymentConfigs(**deployment_configs) + test_run_manager.create_test_run( - session.config.getoption("file_or_dir")[0] + deployment=deployment, + deployment_configs=deployment_configs, + file_name=session.config.getoption("file_or_dir")[0], + disable_request=disable_request, ) except: test_run_manager.create_test_run() +def pytest_addoption(parser): + parser.addoption( + "--deployment", + action="store", + default=None, + help="Set deployment configs", + ) + + @pytest.hookimpl(tryfirst=True) def pytest_runtest_protocol( item: pytest.Item, nextitem: Optional[pytest.Item] diff --git a/deepeval/test_case.py b/deepeval/test_case.py index 49b20954e..9c8ee7772 100644 --- a/deepeval/test_case.py +++ b/deepeval/test_case.py @@ -22,6 +22,7 @@ def __init__( retrieval_context: Optional[List[str]] = None, latency: Optional[float] = None, cost: Optional[float] = None, + dataset_alias: Optional[str] = None, id: Optional[str] = None, ): self.id = id @@ -32,3 +33,4 @@ def __init__( self.retrieval_context = retrieval_context self.latency = latency self.cost = cost + self.dataset_alias = dataset_alias diff --git a/deepeval/test_run/__init__.py b/deepeval/test_run/__init__.py index 07c3ed0ce..96648fcdb 100644 --- a/deepeval/test_run/__init__.py +++ b/deepeval/test_run/__init__.py @@ -1,2 +1,7 @@ -from .test_run import TestRun, test_run_manager, TEMP_FILE_NAME +from .test_run import ( + TestRun, + test_run_manager, + TEMP_FILE_NAME, + DeploymentConfigs, +) from .hooks import on_test_run_end, invoke_test_run_end_hook diff --git a/deepeval/test_run/api.py b/deepeval/test_run/api.py index 4e45c9ef8..a01c40cbd 100644 --- a/deepeval/test_run/api.py +++ b/deepeval/test_run/api.py @@ -21,6 +21,8 @@ class APITestCase(BaseModel): ..., alias="metricsMetadata" ) run_duration: float = Field(..., alias="runDuration") + latency: Optional[float] = Field(None) + cost: Optional[float] = Field(None) traceStack: Optional[dict] = Field(None) context: Optional[list] = Field(None) retrieval_context: Optional[list] = Field(None, alias="retrievalContext") diff --git a/deepeval/test_run/test_run.py b/deepeval/test_run/test_run.py index 10d3cbd51..190ebc794 100644 --- a/deepeval/test_run/test_run.py +++ b/deepeval/test_run/test_run.py @@ -36,6 +36,14 @@ def from_metric(cls, metric: BaseMetric): return cls(metric=metric.__name__, score=metric.score) +class DeploymentConfigs(BaseModel): + env: str + actor: Optional[str] + branch: Optional[str] + sha: Optional[str] + repo: Optional[str] + + class MetricsAverageDict: def __init__(self): self.metric_dict = {} @@ -64,13 +72,17 @@ class 
TestRun(BaseModel): None, alias="testFile", ) + dataset_alias: Optional[str] = Field(None, alias="datasetAlias") + deployment: Optional[bool] = Field(True) + deployment_configs: Optional[DeploymentConfigs] = Field( + None, alias="deploymentConfigs" + ) dict_test_cases: Dict[int, APITestCase] = Field( default_factory=dict, ) test_cases: List[APITestCase] = Field( alias="testCases", default_factory=lambda: [] ) - metric_scores: List[MetricScoreType] = Field( default_factory=lambda: [], alias="metricScores" ) @@ -83,6 +95,9 @@ def add_llm_test_case( run_duration: float, index: int, ): + # Set database alias if exists on test case + self.dataset_alias = test_case.dataset_alias + # Check if test case with the same ID already exists test_case_id = id(test_case) existing_test_case: APITestCase = self.dict_test_cases.get( @@ -116,6 +131,8 @@ def add_llm_test_case( success=metric.is_successful(), metricsMetadata=[metric_metadata], runDuration=run_duration, + latency=test_case.latency, + cost=test_case.cost, context=test_case.context, retrievalContext=test_case.retrieval_context, traceStack=get_trace_stack(), @@ -148,21 +165,32 @@ def __init__(self): self.test_run = None self.temp_file_name = TEMP_FILE_NAME self.save_to_disk = False + self.disable_request = False def reset(self): self.test_run = None self.temp_file_name = TEMP_FILE_NAME self.save_to_disk = False + self.disable_request = False def set_test_run(self, test_run: TestRun): self.test_run = test_run - def create_test_run(self, file_name: Optional[str] = None): + def create_test_run( + self, + deployment: Optional[bool] = False, + deployment_configs: Optional[DeploymentConfigs] = None, + file_name: Optional[str] = None, + disable_request: Optional[bool] = False, + ): + self.disable_request = disable_request test_run = TestRun( testFile=file_name, testCases=[], metricScores=[], configurations={}, + deployment=deployment, + deploymentConfigs=deployment_configs, ) self.set_test_run(test_run) @@ -263,7 +291,7 @@ def post_test_run(self, test_run: TestRun): for test_case in test_run.test_cases: test_case.id = None - if is_confident(): + if is_confident() and self.disable_request is False: try: body = test_run.model_dump(by_alias=True, exclude_none=True) except AttributeError: @@ -285,7 +313,8 @@ def post_test_run(self, test_run: TestRun): "✅ Tests finished! View results on " f"[link={link}]{link}[/link]" ) - webbrowser.open(link) + if test_run.deployment == False: + webbrowser.open(link) else: console.print( '✅ Tests finished! Run "deepeval login" to view evaluation results on the web.' 
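A minimal sketch of how the new `--deployment` plumbing above is expected to fit together, combining the plugin and `TestRunManager` changes (the payload values below are illustrative, not taken from a real run):

```python
import json
from deepeval.test_run import DeploymentConfigs, test_run_manager

# Illustrative payload the CLI would serialize into the new `--deployment` flag
deployment_json = json.dumps({
    "env": "GitHub Actions",
    "actor": "octocat",        # hypothetical actor
    "branch": "main",
    "sha": "abc1234",          # hypothetical commit SHA
    "repo": "org/repo",        # hypothetical repository
    "is_pull_request": False,
})

configs = json.loads(deployment_json)
# Pull requests disable the network request, mirroring pytest_sessionstart above
disable_request = configs.pop("is_pull_request", False)

test_run_manager.create_test_run(
    deployment=True,
    deployment_configs=DeploymentConfigs(**configs),
    file_name="tests/test_quickstart.py",
    disable_request=disable_request,
)
```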
diff --git a/deepeval/utils.py b/deepeval/utils.py index 543468007..25e8ee8d6 100644 --- a/deepeval/utils.py +++ b/deepeval/utils.py @@ -2,7 +2,7 @@ import copy import os import time -from typing import Any +from typing import Any, Optional, Dict from collections.abc import Iterable import tqdm import re @@ -14,6 +14,30 @@ from deepeval.key_handler import KeyValues, KEY_FILE_HANDLER +def get_deployment_configs() -> Optional[Dict]: + if os.getenv("GITHUB_ACTIONS") == "true": + env_info = { + "env": "GitHub Actions", + "actor": os.getenv("GITHUB_ACTOR", None), + "sha": os.getenv("GITHUB_SHA", None), + "repo": os.getenv("GITHUB_REPOSITORY", None), + } + + branch_ref = os.getenv("GITHUB_REF", "") + if branch_ref.startswith("refs/pull/"): + is_pull_request = True + else: + is_pull_request = False + + env_info["is_pull_request"] = is_pull_request + env_info["branch"] = ( + branch_ref.replace("refs/heads/", "") if branch_ref else None + ) + return env_info + + return None + + def is_confident(): confident_api_key = KEY_FILE_HANDLER.fetch_data(KeyValues.API_KEY) return confident_api_key is not None diff --git a/docs/docs/confident-ai-analyze-evaluations.mdx b/docs/docs/confident-ai-analyze-evaluations.mdx index f8f2f6af1..bbeb32cc5 100644 --- a/docs/docs/confident-ai-analyze-evaluations.mdx +++ b/docs/docs/confident-ai-analyze-evaluations.mdx @@ -13,7 +13,7 @@ Confident AI keeps track of your evaluation histories in both development and de ## Visualize Evaluation Results -Once logged in via `deepeval login`, all evaluations executed using `deepeval test run`, `evaluate(dataset, metrics)`, or `dataset.evaluate(metrics)`, will automatically have their results available on Confident. +Once logged in via `deepeval login`, all evaluations executed using `deepeval test run`, `evaluate(...)`, or `dataset.evaluate(...)`, will automatically have their results available on Confident. ![ok](https://d2lsxfc3p6r9rv.cloudfront.net/confident-test-cases.png) diff --git a/docs/docs/confident-ai-evals-in-production.mdx b/docs/docs/confident-ai-evals-in-production.mdx index bc06ebda9..7ff047b0c 100644 --- a/docs/docs/confident-ai-evals-in-production.mdx +++ b/docs/docs/confident-ai-evals-in-production.mdx @@ -14,7 +14,6 @@ Simply add `deepeval.track(...)` in your application to start tracking events. ```python import deepeval -... # At the end of your LLM call deepeval.track( @@ -32,7 +31,6 @@ deepeval.track( fail_silently=True, run_on_background_thread=True ) - ``` The `track()` function takes in the following arguments: diff --git a/docs/docs/confident-ai-github-actions.mdx b/docs/docs/confident-ai-github-actions.mdx new file mode 100644 index 000000000..9870204ee --- /dev/null +++ b/docs/docs/confident-ai-github-actions.mdx @@ -0,0 +1,46 @@ +--- +id: confident-ai-github-actions +title: Evals in GitHub Actions +sidebar_label: Evals in GitHub Actions +--- + +## Quick Summary + +Confident AI allows you to monitor evaluation results in CI/CD pipelines using GitHub Actions, specifically on pushes to the repository. To set this up, simply execute `deepeval test run` within your workflow defined in a YAML file located in the `.github/workflows/` directory of your GitHub repository. + +:::info +Confident is currently integrated with GitHub Actions. +::: + +## Setup Evals for GitHub Actions + +`deepeval` tracks evaluations ran in GitHub Actions for push events only. 
To begin, define an evaluation dataset/test cases in a test file and execute it via `deepeval test run` in a GitHub workflow YAML file: + +```yaml title=".github/workflows/llm-evaluations.yml" +name: LLM Deployment Evaluations + +# Make sure to include push events +on: + push: + +jobs: + test: + runs-on: ubuntu-latest + steps: + # Some extra steps to setup and install dependencies + ... + + - name: Login to Confident + env: + CONFIDENT_API_KEY: ${{ secrets.CONFIDENT_API_KEY }} + run: poetry run deepeval login --confident-api-key "$CONFIDENT_API_KEY" + + - name: Run deepeval tests + run: poetry run deepeval test run test_file.py +``` + +:::note +Your workflow file does **NOT** have to be same as the example shown above. In the example, we used poetry and GitHub secrets to store and access our API key, which is not a strict requirement. +::: + +**Congratulations!** With this setup, `deepeval` will automatically log evaluation results to your project's deployments page on Confident AI. diff --git a/docs/docs/evaluation-test-cases.mdx b/docs/docs/evaluation-test-cases.mdx index c857962ac..d12b280f9 100644 --- a/docs/docs/evaluation-test-cases.mdx +++ b/docs/docs/evaluation-test-cases.mdx @@ -182,13 +182,14 @@ The `latency` is an **optional** parameter that represents how long it took your test_case = LLMTestCase( input="...", actual_output="...", - # Replace this with the actual latency of your LLM application + # Replace this with the actual latency it took your + # LLM (application) to generate the actual output latency=10.4 ) ``` -:::note -`deepeval` does not offer metrics that evaluate on latency and cost, so feel free to supply the `latency` in either seconds, miliseconds, or even nanoseconds. That being said, [here is a full working example](metrics-custom#implementation) of how you can build your own `LatencyMetric` using the `latency` parameter. +:::info +The only `deepeval` metric that uses the `latency` parameter is the [`LatencyMetric`.](metrics-latency) ::: ## Cost @@ -199,13 +200,14 @@ The `cost` is an **optional** parameter that represents the token cost for a giv test_case = LLMTestCase( input="...", actual_output="...", - # Replace this with the actual latency of your LLM application + # Replace this with the actual cost it took your + # LLM (application) to generate the actual output cost=0.78 ) ``` :::info -`deepeval` does not offer cost and latency metrics because it is difficult to account for all different units and currencies available. We highly encourage you to look at the [custom metrics section](metrics-custom#implementation) for a full example on how to create your own metric if you are looking to evaluate cost and latency. +Similar to the `LatencyMetric`, the [`CostMetric`](metrics-cost) is the only `deepeval` metric that uses the `cost` parameter. ::: ## Run A Test Case @@ -345,4 +347,4 @@ metric = HallucinationMetric(threshold=0.7) evaluate(test_cases, [metric]) ``` -Similar to `assert_test`, `evaluate` allows you to log and view test results on Confident AI. For more examples of `evalute`, visit the [datasets section](evaluation-datasets). +Similar to `assert_test`, `evaluate` allows you to log and view test results on Confident AI. For more examples of `evaluate`, visit the [datasets section](evaluation-datasets). 
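A quick sketch tying the updated `latency`/`cost` docs to the renamed metric arguments further down in this diff (`max_latency`, `max_cost`); the values are illustrative:

```python
from deepeval import assert_test
from deepeval.metrics import CostMetric, LatencyMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    latency=10.4,  # same unit as max_latency below (seconds here)
    cost=0.78,     # same monetary unit as max_cost below
)

# Passes only if latency <= max_latency and cost <= max_cost
assert_test(test_case, [LatencyMetric(max_latency=12.0), CostMetric(max_cost=1.0)])
```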
diff --git a/docs/docs/integrations-huggingface.mdx b/docs/docs/integrations-huggingface.mdx new file mode 100644 index 000000000..d5ee2198a --- /dev/null +++ b/docs/docs/integrations-huggingface.mdx @@ -0,0 +1,117 @@ +--- +# id: integrations-huggingface +title: Hugging Face +sidebar_label: Hugging Face +--- + +## Quick Summary + +`DeepEvalHuggingFaceCallback` is a custom Hugging Face `transformers.TrainerCallback` for in-depth evaluation of LLM/LM models during training/fine-tuning using `transformers.Trainer`. + +## Usage + +### Importing the Necessary Components + +```python +from transformers import ( + Seq2SeqTrainer, + Seq2SeqTrainingArguments, + T5Tokenizer, + T5ForConditionalGeneration, + DataCollatorForSeq2Seq, +) + +from datasets import load_dataset + +from deepeval.integrations import DeepEvalHuggingFaceCallback +from deepeval.metrics import HallucinationMetric, AnswerRelevancyMetric +from deepeval.dataset import EvaluationDataset, Golden +``` + +### Initializing Metrics and Evaluation Dataset + +```python +# Define evaluation metrics +hallucination_metric = HallucinationMetric(threshold=0.3) +answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5) +metrics = [hallucination_metric, answer_relevancy_metric] + +# Define goldens and eval_dataset +goldens = [Golden(...), Golden(...), Golden(...)] +eval_dataset = EvaluationDataset(goldens=goldens) +``` + +### Initialize `transformers` Trainer and Tokenizer + +```python + +# Load training Dataset +training_dataset = load_dataset('DATASET') + +# Initialize tokenizer and model +tokenizer = T5Tokenizer.from_pretrained("MODEL-ID") +model = T5ForConditionalGeneration.from_pretrained("MODEL-ID") + +tokenizer_args = {...} +``` + +### Initialize `transformers.Trainer` + +```python +# Define training args +training_args = Seq2SeqTrainingArguments( + output_dir="OUTPUT-DIR", + overwrite_output_dir=True, + num_train_epochs=50, + per_device_train_batch_size=8, +) + +# Create Trainer instance (Seq2SeqTrainer is a child of Trainer) +trainer = Seq2SeqTrainer( + model=model, + tokenizer=tokenizer, + args=training_args, + train_dataset=training_dataset, +) +``` + +### Initialize `DeepEvalHuggingFaceCallback` and begin Training + +```python +callback = DeepEvalHuggingFaceCallback( + metrics=metrics, + evaluation_dataset=eval_dataset, + tokenizer_args=tokenizer_args, + trainer=trainer, + show_table=True, + show_table_every=1, +) + +# Add the callback to the Trainer +trainer.add_callback(callback) + +# Start model training +trainer.train() +``` + +## Reference + +### `DeepEvalHuggingFaceCallback` Class + +#### Attributes + +- **`show_table`**: Flag indicating whether to display a table with evaluation metric scores. +- **`show_table_every`**: Frequency of displaying the evaluation table. +- **`metrics`**: Evaluation metrics used during training. +- **`evaluation_dataset`**: Dataset for evaluation. +- **`tokenizer_args`**: Arguments for the tokenizer. +- **`aggregation_method`**: Method for aggregating metric scores for multiple Goldens. +- **`trainer`**: `transformers.Trainer` instance. + +#### Methods + +- **`on_epoch_begin`**: Triggered at the beginning of each training epoch. +- **`on_epoch_end`**: Triggered at the end of each training epoch. +- **`on_log`**: Triggered after logging the last logs. +- **`on_train_end`**: Triggered at the end of model training. +- **`on_train_begin`**: Triggered at the beginning of model training.
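The `Golden(...)` placeholders in the callback doc above can be filled in with real data; a hedged example, assuming `Golden` exposes `input`, `expected_output`, and `context` fields:

```python
from deepeval.dataset import EvaluationDataset, Golden

# Hypothetical goldens for the evaluation dataset used by the callback
goldens = [
    Golden(
        input="What is the capital of France?",
        expected_output="Paris",
        context=["Paris is the capital of France."],
    ),
    Golden(
        input="Summarize the refund policy.",
        expected_output="Customers get a full refund within 30 days.",
    ),
]

eval_dataset = EvaluationDataset(goldens=goldens)
```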
diff --git a/docs/docs/integrations-introduction.mdx b/docs/docs/integrations-introduction.mdx index 5924b184c..73be7b434 100644 --- a/docs/docs/integrations-introduction.mdx +++ b/docs/docs/integrations-introduction.mdx @@ -8,8 +8,7 @@ sidebar_label: Introduction `deepeval` offers multiple integrations for those who have already built LLM apps using other frameworks. We currently support: -- lLamaindex -- _LangChain (to be documented...)_ -- _Guardrails (to be documented...)_ +- LlamaIndex +- _Hugging Face (to be documented...)_ -You're by no means required to leverage these integrations as `deepeval` is not vendor locked into any framework. After all, all we need is data being passed to `deepeval` to evaluate your LLM application. However, you may find our integrations helpful in keeping your codebase cleaner, so we recommend giving it a try if you're looking to optimize for readability and maintability. +You're by no means required to leverage these integrations as `deepeval` is not vendor locked into any framework. After all, all we need is data from test cases to evaluate your LLM application. However, you may find our integrations helpful in keeping your codebase cleaner, so we recommend giving it a try if you're looking to optimize for readability and maintainability. diff --git a/docs/docs/integrations-llamaindex.mdx b/docs/docs/integrations-llamaindex.mdx index af5549153..072677460 100644 --- a/docs/docs/integrations-llamaindex.mdx +++ b/docs/docs/integrations-llamaindex.mdx @@ -1,100 +1,211 @@ --- id: integrations-llamaindex -title: Evaluating LlamaIndex +title: LlamaIndex sidebar_label: LlamaIndex --- ## Quick Summary -DeepEval integrates nicely with LlamaIndex's `ResponseEvaluator` class. Below is an example of the factual consistency documentation. +LlamaIndex is a data framework for LLMs that facilitates the ingestion of data from various sources such as APIs, databases, and PDFs, and indexes it for later retrieval in RAG-based LLM applications. + +## Evaluating LlamaIndex (RAG) Applications + +RAG applications built using LlamaIndex can be easily evaluated within `deepeval`. Let's use this RAG application built using LlamaIndex as an example: + +```python +from llama_index import VectorStoreIndex, SimpleDirectoryReader + +# Consult the LlamaIndex docs if you're unsure what this does +documents = SimpleDirectoryReader("YOUR_DATA_DIRECTORY").load_data() +index = VectorStoreIndex.from_documents(documents) +rag_application = index.as_query_engine() +``` + +You can then query your RAG application and evaluate each response using `deepeval`'s metrics: + +```python +from deepeval.metrics import AnswerRelevancyMetric +from deepeval.test_case import LLMTestCase +... + +# An example input to your RAG application +user_input = "What is LlamaIndex?"
+ +# LlamaIndex returns a response object that contains +# both the output string and retrieved nodes +response_object = rag_application.query(user_input) + +# Process the response object to get the output string +# and retrieved nodes +if response_object is not None: + actual_output = response_object.response + retrieval_context = [node.get_content() for node in response_object.source_nodes] + +# Create a test case and metric as usual +test_case = LLMTestCase( + input=user_input, + actual_output=actual_output, + retrieval_context=retrieval_context +) +answer_relevancy_metric = AnswerRelevancyMetric() + +# Evaluate +answer_relevancy_metric.measure(test_case) +print(answer_relevancy_metric.score) +print(answer_relevancy_metric.reason) +``` + +:::info +You can also extract all necessary outputs and retrieval contexts for each given input to your LlamaIndex application to [create an `EvaluationDataset` to evaluate test cases in bulk.](evaluation-datasets) +::: + +## Using DeepEval for LlamaIndex + +In LlamaIndex, there are entities known as evaluators that evaluate the responses of LlamaIndex applications. Continuing from the previous example, here's an alternative way to make use of the `AnswerRelevancyMetric` through `deepeval`'s LlamaIndex evaluators: ```python -from llama_index.response.schema import Response -from typing import List -from llama_index.schema import Document -from deepeval.metrics import HallucinationMetric -from llama_index import ( - TreeIndex, - VectorStoreIndex, - SimpleDirectoryReader, - LLMPredictor, - ServiceContext, - Response, +from deepeval.integrations.llamaindex import DeepEvalAnswerRelevancyEvaluator +... + +# An example input to your RAG application +user_input = "What is LlamaIndex?" + +# LlamaIndex returns a response object that contains +# both the output string and retrieved nodes +response_object = rag_application.query(user_input) + +evaluator = DeepEvalAnswerRelevancyEvaluator() +evaluation_result = evaluator.evaluate_response( + query=user_input, + response=response_object ) -from llama_index.llms import OpenAI -from llama_index.evaluation import ResponseEvaluator +print(evaluation_result) +``` -import os -import openai +:::note +In LlamaIndex's documentation, you might see examples where the `evaluate()` method is called on an evaluator instead of the `evaluate_response()` method. While both are correct, you should **ALWAYS** use the `evaluate_response()` method when using `deepeval`'s LlamaIndex evaluators. +::: -api_key = "sk-XXX" -openai.api_key = api_key +### Answer Relevancy -gpt4 = OpenAI(temperature=0, model="gpt-4", api_key=api_key) -service_context_gpt4 = ServiceContext.from_defaults(llm=gpt4) -evaluator_gpt4 = ResponseEvaluator(service_context=service_context_gpt4) +The `DeepEvalAnswerRelevancyEvaluator` uses `deepeval`'s `AnswerRelevancyMetric` for evaluation. + +```python +from deepeval.integrations.llamaindex import DeepEvalAnswerRelevancyEvaluator + +evaluator = DeepEvalAnswerRelevancyEvaluator( + # Optional. A float representing the minimum passing threshold, defaulted to 0.5. + threshold=0.5, + # Optional. A string specifying which of OpenAI's GPT models to use, defaulted to 'gpt-4-1106-preview'. + model="gpt-4-1106-preview", + # Optional. A boolean which when set to `True`, will include a reason for its evaluation score, defaulted to `True`. + include_reason=True +) ``` -Getting a lLamaHub Loader +### Faithfulness + +The `DeepEvalFaithfulnessEvaluator` uses `deepeval`'s `FaithfulnessMetric` for evaluation.
```python -from llama_index import download_loader +from deepeval.integrations.llamaindex import DeepEvalFaithfulnessEvaluator + +evaluator = DeepEvalFaithfulnessEvaluator( + # Optional. A float representing the minimum passing threshold, defaulted to 0.5. + threshold=0.5, + # Optional. A string specifying which of OpenAI's GPT models to use, defaulted to 'gpt-4-1106-preview'. + model="gpt-4-1106-preview", + # Optional. A boolean which when set to `True`, will include a reason for its evaluation score, defaulted to `True`. + include_reason=True +) +``` + +### Contextual Relevancy -WikipediaReader = download_loader("WikipediaReader") +The `DeepEvalContextualRelevancyEvaluator` uses `deepeval`'s `ContextualRelevancyMetric` for evaluation. -loader = WikipediaReader() -documents = loader.load_data(pages=['Tokyo']) -tree_index = TreeIndex.from_documents(documents=documents) -vector_index = VectorStoreIndex.from_documents( - documents, service_context=service_context_gpt4 +```python +from deepeval.integrations.llamaindex import DeepEvalContextualRelevancyEvaluator + +evaluator = DeepEvalContextualRelevancyEvaluator( + # Optional. A float representing the minimum passing threshold, defaulted to 0.5. + threshold=0.5, + # Optional. A string specifying which of OpenAI's GPT models to use, defaulted to 'gpt-4-1106-preview'. + model="gpt-4-1106-preview", + # Optional. A boolean which when set to `True`, will include a reason for its evaluation score, defaulted to `True`. + include_reason=True ) ``` -We then build an evaluator based on the BaseEvaluator class that requires an evaluate method. +### Summarization -In this example, we show you how to write a factual consistency check. +The `DeepEvalSummarizationEvaluator` uses `deepeval`'s `SummarizationMetric` for evaluation. ```python -from deepeval.test_case import LLMTestCase -from deepeval.metrics import HallucinationMetric +from deepeval.integrations.llamaindex import DeepEvalSummarizationEvaluator -class HallucinationResponseEvaluator: - def get_context(self, response: Response) -> List[Document]: - """Get context information from given Response object using source nodes. +evaluator = DeepEvalSummarizationEvaluator( + # Optional. A float representing the minimum passing threshold, defaulted to 0.5. + threshold=0.5, + # Optional. A string specifying which of OpenAI's GPT models to use, defaulted to 'gpt-4-1106-preview'. + model="gpt-4-1106-preview" +) +``` - Args: - response (Response): Response object from an index based on the query. +### Bias - Returns: - List of Documents of source nodes information as context information. - """ - context = [] +The `DeepEvalBiasEvaluator` uses `deepeval`'s `BiasMetric` for evaluation. + +```python +from deepeval.integrations.llamaindex import DeepEvalBiasEvaluator - for context_info in response.source_nodes: - context.append(Document(text=context_info.node.get_content())) +evaluator = DeepEvalBiasEvaluator( + # Optional. A float representing the minimum passing threshold, defaulted to 0.5. + threshold=0.5 +) +``` - return context +### Toxicity - def evaluate(self, response: Response) -> str: +The `DeepEvalToxicityEvaluator` uses `deepeval`'s `ToxicityMetric` for evaluation. 
- # Evaluate factual consistency metrics - answer = str(response) - metric = HallucinationMetric() - context = self.get_context(response) - test_case = LLMTestCase(input="This is an example input", context=context, actual_output=answer) - score = metric.measure(test_case=test_case) - if metric.is_successful(): - return "YES" - else: - return "NO" +```python +from deepeval.integrations.llamaindex import DeepEvalToxicityEvaluator -evaluator = HallucinationResponseEvaluator() +evaluator = DeepEvalToxicityEvaluator( + # Optional. A float representing the minimum passing threshold, defaulted to 0.5. + threshold=0.5 +) ``` -You can then evaluate as such: +## Metrics vs Evaluators + +While both `deepeval`'s metrics and evaluators yield the same result, `deepeval` is a full evaluation suite built specifically for LLM evaluation. Naturally, `deepeval` forces you to follow evaluation best practices, something not accomplishable through the use of the evaluators abstraction. + +So while both metrics and evaluators can be used for a one-off, standalone evaluation, metrics: + +- can be combined to evaluate multiple criteria asynchronously +- can be used to evaluate entire `EvaluationDataset`s +- can leverage `deepeval`'s native Pytest integration to unit test LlamaIndex applications in CI/CD pipelines +- can be used with any framework, meaning you are not vendor locked into LlamaIndex +- cover a wider range of evaluation criteria/use cases +- automatically integrate with [Confident AI](confident-ai-introduction), which offers evaluation analysis, evaluation debugging, dataset management, and real-time evaluations in production + +:::note +The only upside of using `deepeval`'s LlamaIndex evaluators instead of metrics is that an evaluator automatically extracts the `retrieval_context` from a LlamaIndex response. However, as shown in previous examples, manually extracting the `retrieval_context` from a LlamaIndex response is extremely straightforward: ```python -query_engine = tree_index.as_query_engine() -response = query_engine.query("How did Tokyo get its name?") -eval_result = evaluator.evaluate(response) +... + +# LlamaIndex returns a response object that contains +# both the output string and retrieved nodes +response_object = rag_application.query(user_input) + +# Process the response object to get the output string +# and retrieved nodes +if response_object is not None: + actual_output = response_object.response + retrieval_context = [node.get_content() for node in response_object.source_nodes] ``` + +::: diff --git a/docs/docs/metrics-bias.mdx b/docs/docs/metrics-bias.mdx index b5d0d60ff..acad4a95b 100644 --- a/docs/docs/metrics-bias.mdx +++ b/docs/docs/metrics-bias.mdx @@ -4,7 +4,7 @@ title: Bias sidebar_label: Bias --- -The bias metric determines whether your LLM has gender, racial, or political bias in whatever parameters you want to evaluate it on. This can occur after fine-tuning a custom model from any RLHF or optimizations. +The bias metric determines whether your LLM output contains gender, racial, or political bias. This can occur after fine-tuning a custom model from any RLHF or optimizations. :::info Bias in `deepeval` is a **referenceless** metric. This means the score calculated for parameters provided in your `LLMTestCase`, like the `actual_output`, is not dependent on anything other than the value of the parameter itself.
@@ -20,35 +20,28 @@ pip install Dbias ## Required Arguments -To use the `UnBiasedMetric`, you'll have to provide the following arguments when creating an `LLMTestCase`: +To use the `BiasMetric`, you'll have to provide the following arguments when creating an `LLMTestCase`: - `input` - `actual_output` ## Example -Unlike other metrics you've encountered to far, the `UnBiasedMetric` requires an extra parameter named evaluation_params. This parameter is an array, containing elements of the type LLMTestCaseParams, and specifies the parameter(s) of a given LLMTestCase that will be assessed for toxicity. The `UnBiasedMetric` will compute a score based on the average bias of each individual component being evaluated. - ```python -from deepeval import evaluate -from deepeval.metrics import UnBiasedMetric -from deepeval.test_case import LLMTestCase, LLMTestCaseParams - -# Replace this with the actual output from your LLM application -actual_output = "We offer a 30-day full refund at no extra cost." +from deepeval.metrics import BiasMetric +from deepeval.test_case import LLMTestCase -metric = UnBiasedMetric( - evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT], - threshold=0.5 -) +metric = BiasMetric(threshold=0.5) test_case = LLMTestCase( input="What if these shoes don't fit?", - actual_output=actual_output, + # Replace this with the actual output from your LLM application + actual_output = "We offer a 30-day full refund at no extra cost." ) metric.measure(test_case) print(metric.score) - -# or evaluate test cases in bulk -evaluate([test_case], [metric]) ``` + +:::note +Unlike other metrics you've seen so far, the `threshold` for the `BiasMetric` is instead a maxmium threshold. +::: diff --git a/docs/docs/metrics-cost.mdx b/docs/docs/metrics-cost.mdx index 5923ebdf8..880f5ff86 100644 --- a/docs/docs/metrics-cost.mdx +++ b/docs/docs/metrics-cost.mdx @@ -21,7 +21,7 @@ from deepeval import evaluate from deepeval.metrics import CostMetric from deepeval.test_case import LLMTestCase -metric = CostMetric(threshold=0.4) +metric = CostMetric(max_cost=0.4) test_case = LLMTestCase( input="...", actual_output="...", @@ -29,10 +29,10 @@ test_case = LLMTestCase( ) metric.measure(test_case) -# True if cost <= threshold +# True if cost <= max_cost print(metric.is_successful()) ``` :::note -Similar to `LatencyMetric`, the `CostMetric` threshold does **NOT** have any standard units. However, you need to make sure the monetary units you provide in the `cost` argument when creating an `LLMTestCase` matches that of the cost `threshold`. +Similar to `LatencyMetric`, the `CostMetric` threshold, `max_cost`, does **NOT** have any standard units. However, you need to make sure the monetary units you provide in the `cost` argument when creating an `LLMTestCase` matches that of the cost `max_cost`. ::: diff --git a/docs/docs/metrics-introduction.mdx b/docs/docs/metrics-introduction.mdx index 58e2618e4..8f0387628 100644 --- a/docs/docs/metrics-introduction.mdx +++ b/docs/docs/metrics-introduction.mdx @@ -79,19 +79,55 @@ deepeval unset-azure-openai We highly discourage the use of custom LLMs since evaluation requires a high level of reasoning capabilities that we find are generally not reachable apart from (Azure) OpenAI's GPT models. 
-But to use a custom LLM for evaluation, `deepeval` metrics currently supports all of langchain's [Chat Models](https://python.langchain.com/docs/integrations/chat/), which you can provide through the `model` argument when instantiating an LLM-based metric: +That being said, `deepeval` allows you to use **ANY** custom LLM for evaluation. This includes LLMs through langchain's [Chat Models](https://python.langchain.com/docs/integrations/chat/), or even LLMs in GGML format, which you can specify through the `model` argument when instantiating an LLM-based metric. Here is an example of using a custom Azure OpenAI model through langchain's `AzureChatOpenAI` interface for evaluation: ```python from langchain_openai import AzureChatOpenAI +from deepeval.models.base import DeepEvalBaseModel +class CustomEvaluationModel(DeepEvalBaseModel): + def __init__( + self, + model + ): + self.model = model + + def load_model(self): + return self.model + + def _call(self, prompt: str) -> str: + chat_model = self.load_model() + return chat_model.invoke(prompt) + + def get_model_name(self): + return "Custom Azure OpenAI Model" + +# Replace these with real values custom_azure_openai_model = AzureChatOpenAI( openai_api_version=openai_api_version, azure_deployment=azure_deployment, azure_endpoint=azure_endpoint, openai_api_key=openai_api_key, ) +custom_evaluation_model = CustomEvaluationModel(model=custom_azure_openai_model) +print(custom_evaluation_model("Write me a joke")) +``` + +Remember, when creating a custom LLM evaluation model you should always: + +- inherit `DeepEvalBaseModel`. +- implement the `load_model()` method, which will be responsible for returning a model object. +- implement the `_call()` method with **one and only one** parameter of type string that acts as the prompt to your custom LLM. +- the `_call()` method should return the final output string of your custom LLM. Note that we called `chat_model.invoke(prompt)` in this particular example, but this could be different depending on the implementation of your custom LLM object. +- the `get_model_name()` method simply returns a string representing the name of your LLM model. + +Note that the `model` argument in the `__init__()` method can accept any type (the model string or object itself). Lastly, to use it for evaluation in LLM-based metrics: + +```python +from deepeval.metrics import AnswerRelevancyMetric +... -answer_relevancy_metric = AnswerRelevancyMetric(model=custom_azure_openai_model) +metric = AnswerRelevancyMetric(model=custom_evaluation_model) ``` :::note diff --git a/docs/docs/metrics-latency.mdx b/docs/docs/metrics-latency.mdx index fcf3688f7..ccde3f3e6 100644 --- a/docs/docs/metrics-latency.mdx +++ b/docs/docs/metrics-latency.mdx @@ -7,7 +7,7 @@ sidebar_label: Latency The latency metric measures whether the completion time of your LLM (application) is efficient and meets the expected time limits. It is one of the two performance metrics offered by `deepeval`. :::info -Performance metrics in `deepeval` are metrics that evaluate aspects such as latency and cost, rather than the outputs of LLM (applications). +Performance metrics in `deepeval` are metrics that evaluate aspects such as latency and cost, rather than the outputs of your LLM (application).
::: ## Required Arguments @@ -25,7 +25,7 @@ from deepeval import evaluate from deepeval.metrics import LatencyMetric from deepeval.test_case import LLMTestCase -metric = LatencyMetric(threshold=10.0) +metric = LatencyMetric(max_latency=10.0) test_case = LLMTestCase( input="...", actual_output="...", @@ -33,12 +33,12 @@ test_case = LLMTestCase( ) metric.measure(test_case) -# True if latency <= threshold +# True if latency <= max_latency print(metric.is_successful()) ``` :::note -It does not matter what unit of time you provide the `threshold` argument with, it only has to match the unit of `latency` when creating an `LLMTestCase`. +It does not matter what unit of time you provide the `max_latency` argument with, it only has to match the unit of `latency` when creating an `LLMTestCase`. ::: diff --git a/docs/docs/metrics-ragas.mdx b/docs/docs/metrics-ragas.mdx index 9b0537ec5..808b0125a 100644 --- a/docs/docs/metrics-ragas.mdx +++ b/docs/docs/metrics-ragas.mdx @@ -56,3 +56,16 @@ There are three optional parameters when creating a `RagasMetric`: - [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to 0.5. - [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** any one of langchain's [chat models](https://python.langchain.com/docs/integrations/chat/) of type `BaseChatModel`. Defaulted to 'gpt-3.5-turbo'. + +:::note +You can also choose to import and evaluate using each metric individually: + +```python +from deepeval.metrics.ragas import RAGASAnswerRelevancyMetric +from deepeval.metrics.ragas import RAGASFaithfulnessMetric +from deepeval.metrics.ragas import RAGASContextualRecallMetric +from deepeval.metrics.ragas import RAGASContextualPrecisionMetric +``` + +These metrics accept the same arguments as the `RagasMetric`. +::: diff --git a/docs/docs/metrics-toxicity.mdx b/docs/docs/metrics-toxicity.mdx index 754a4f0fd..84e46dfbc 100644 --- a/docs/docs/metrics-toxicity.mdx +++ b/docs/docs/metrics-toxicity.mdx @@ -4,7 +4,7 @@ title: Toxicity sidebar_label: Toxicity --- -The toxicity metric is another **referenceless** metric that evaluates toxicness in your LLM's outputs. This is particularly useful for a fine-tuning use case. +The toxicity metric is another **referenceless** metric that evaluates toxicness in your LLM outputs. This is particularly useful for a fine-tuning use case. ## Installation @@ -16,35 +16,28 @@ pip install detoxify ## Required Arguments -To use the `NonToxicMetric`, you'll have to provide the following arguments when creating an `LLMTestCase`: +To use the `ToxicityMetric`, you'll have to provide the following arguments when creating an `LLMTestCase`: - `input` - `actual_output` ## Example -Also being a referenceless like `UnBiasedMetric`, the `NonToxicMetric` similarily requires an extra parameter named `evaluation_params`. The final score is the average of the toxicity scores computed for each individual component being evaluated. - ```python -from deepeval import evaluate -from deepeval.metrics import NonToxicMetric -from deepeval.test_case import LLMTestCase, LLMTestCaseParams - -# Replace this with the actual output from your LLM application -actual_output = "We offer a 30-day full refund at no extra cost." 
+from deepeval.metrics import ToxicityMetric +from deepeval.test_case import LLMTestCase -metric = NonToxicMetric( - evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT], - threshold=0.5 -) +metric = ToxicityMetric(threshold=0.5) test_case = LLMTestCase( input="What if these shoes don't fit?", - actual_output=actual_output, + # Replace this with the actual output from your LLM application + actual_output = "We offer a 30-day full refund at no extra cost." ) metric.measure(test_case) print(metric.score) - -# or evaluate test cases in bulk -evaluate([test_case], [metric]) ``` + +:::note +Similar to the `BiasMetric`, the `threshold` in toxicity is a maxmium threshold. +::: diff --git a/docs/docusaurus.config.js b/docs/docusaurus.config.js index e7566e0a1..c97fe6dfc 100644 --- a/docs/docusaurus.config.js +++ b/docs/docusaurus.config.js @@ -90,6 +90,12 @@ const config = { }, ], }, + algolia: { + appId: '7U9PQIW1ZA', + apiKey: 'fb799aeac8bcd0f6b9e0e233a385ad33', + indexName: 'confident-ai', + contextualSearch: true, + }, colorMode: { defaultMode: 'dark', disableSwitch: false, diff --git a/docs/sidebars.js b/docs/sidebars.js index 8809f489c..6b2f2b369 100644 --- a/docs/sidebars.js +++ b/docs/sidebars.js @@ -4,7 +4,7 @@ module.exports = { type: 'category', label: 'Getting Started', items: [ - 'getting-started', + 'getting-started', ], collapsed: false, }, @@ -19,23 +19,23 @@ module.exports = { type: 'category', label: 'Metrics', items: [ - 'metrics-introduction', - 'metrics-llm-evals', - 'metrics-summarization', - 'metrics-answer-relevancy', - 'metrics-faithfulness', - 'metrics-contextual-precision', - 'metrics-contextual-relevancy', - 'metrics-contextual-recall', - 'metrics-ragas', - 'metrics-latency', - 'metrics-cost', - 'metrics-hallucination', - 'metrics-bias', - 'metrics-toxicity', - 'metrics-judgemental', - 'metrics-custom', - 'metrics-others', + 'metrics-introduction', + 'metrics-llm-evals', + 'metrics-summarization', + 'metrics-answer-relevancy', + 'metrics-faithfulness', + 'metrics-contextual-precision', + 'metrics-contextual-relevancy', + 'metrics-contextual-recall', + 'metrics-ragas', + 'metrics-latency', + 'metrics-cost', + 'metrics-hallucination', + 'metrics-bias', + 'metrics-toxicity', + 'metrics-judgemental', + 'metrics-custom', + 'metrics-others', ], collapsed: false, }, @@ -51,6 +51,7 @@ module.exports = { 'confident-ai-evaluate-datasets', 'confident-ai-analyze-evaluations', 'confident-ai-debug-evaluations', + 'confident-ai-github-actions', 'confident-ai-evals-in-production' ], collapsed: false, @@ -60,7 +61,8 @@ module.exports = { label: 'Integrations', items: [ 'integrations-introduction', - 'integrations-llamaindex' + 'integrations-llamaindex', + 'integrations-huggingface' ], collapsed: false, }, diff --git a/llama_test/chatbot.py b/llama_test/chatbot.py index a07724b5e..cf5b2babc 100644 --- a/llama_test/chatbot.py +++ b/llama_test/chatbot.py @@ -2,13 +2,15 @@ from llama_index import ServiceContext import llama_index -llama_index.set_global_handler("deepeval") - -service_context = ServiceContext.from_defaults(chunk_size=1000) +# llama_index.set_global_handler("deepeval") +service_context = ServiceContext.from_defaults(chunk_size=500) documents = SimpleDirectoryReader("data").load_data() index = VectorStoreIndex.from_documents(documents) query_engine = index.as_query_engine(similarity_top_k=5) def query(user_input): - return query_engine.query(user_input).response + res = query_engine.query(user_input) + # evaluator = ToxicityEvaluator() + # result = 
evaluator.evaluate_response(query=user_input, response=res) + return res.response diff --git a/pyproject.toml b/pyproject.toml index e46e16a4f..f3ccfdbf8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "deepeval" -version = "0.20.49" +version = "0.20.56" description = "The Evaluation Framework for LLMs" authors = ["Jeffrey Ip "] license = "Apache-2.0" diff --git a/tests/test_bias.py b/tests/test_bias.py index b1d16cd5d..c800a82d0 100644 --- a/tests/test_bias.py +++ b/tests/test_bias.py @@ -4,13 +4,13 @@ # import pytest # from deepeval.test_case import LLMTestCase, LLMTestCaseParams -# from deepeval.metrics import UnBiasedMetric +# from deepeval.metrics import BiasMetric # from deepeval import assert_test # output = "Winners of the FIFA world cup were the French national football team" # TODO: uncomment once unbias fixed # def test_non_bias_metric(): -# metric = UnBiasedMetric(evaluation_params=[LLMTestCaseParams.INPUT]) +# metric = BiasMetric(evaluation_params=[LLMTestCaseParams.INPUT]) # test_case = LLMTestCase(input="placeholder", actual_output=output) # assert_test(test_case, [metric]) diff --git a/tests/test_cost.py b/tests/test_cost.py deleted file mode 100644 index ca9b6ae8d..000000000 --- a/tests/test_cost.py +++ /dev/null @@ -1,9 +0,0 @@ -from deepeval.metrics import CostMetric -from deepeval.test_case import LLMTestCase -from deepeval import assert_test - - -def test_cost_metric(): - metric = CostMetric(threshold=12) - test_case = LLMTestCase(input="...", actual_output="...", cost=12) - assert_test(test_case, [metric]) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index d99ba8e98..d932c7bed 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -3,13 +3,12 @@ import pytest from deepeval.dataset import EvaluationDataset from deepeval.metrics import HallucinationMetric -from deepeval import assert_test +from deepeval import assert_test, evaluate from deepeval.test_case import LLMTestCase -dataset = EvaluationDataset() - def test_create_dataset(): + dataset = EvaluationDataset() module_b_dir = os.path.dirname(os.path.realpath(__file__)) file_path = os.path.join(module_b_dir, "data", "dataset.csv") @@ -32,10 +31,27 @@ def test_create_dataset(): ) assert len(dataset.test_cases) == 10, "Test Cases not loaded from JSON" - # dataset.push("alias") + +# test_case = LLMTestCase( +# input="What if these shoes don't fit?", +# # Replace this with the actual output from your LLM application +# actual_output="We offer a 30-day full refund at no extra costs.", +# context=["All customers are eligible for a 30 day full refund at no extra costs."] +# ) +# dataset = EvaluationDataset(alias="123", test_cases=[test_case]) + +# @pytest.mark.parametrize( +# "test_case", +# dataset, +# ) +# def test_test_dataset(test_case: LLMTestCase): +# metric = HallucinationMetric(threshold=0.5) +# assert_test(test_case, [metric]) -# dataset.pull("alias") +# dataset = EvaluationDataset() +# dataset.pull("Testa") +# print(dataset.test_cases) # @pytest.mark.parametrize( # "test_case", # dataset, diff --git a/tests/test_g_eval.py b/tests/test_g_eval.py index e4c09a8d3..aa10d3adc 100644 --- a/tests/test_g_eval.py +++ b/tests/test_g_eval.py @@ -1,12 +1,11 @@ import pytest -import openai from deepeval.test_case import LLMTestCase, LLMTestCaseParams from deepeval.metrics import GEval from deepeval import assert_test -def test_chat_completion(): - """Test Chat Completion""" +@pytest.mark.skip(reason="openai is expensive") +def test_g_eval(): metric = GEval( 
name="Validity", criteria="The response is a valid response to the prompt.", diff --git a/tests/test_hallucination_metric.py b/tests/test_hallucination_metric.py index 13560d23c..f1d2df6a5 100644 --- a/tests/test_hallucination_metric.py +++ b/tests/test_hallucination_metric.py @@ -12,6 +12,8 @@ def test_hallucination_metric(): context=[ "A man with blond-hair, and a brown shirt drinking out of a public water fountain." ], + cost=0.4, + latency=2, ) assert_test(test_case, [metric]) @@ -22,6 +24,8 @@ def test_hallucination_metric_2(): input="placeholder", actual_output="Python is a programming language.", context=["Python is NOT a programming language."], + cost=1, + latency=0.2, ) with pytest.raises(AssertionError): assert_test(test_case, [metric]) @@ -33,6 +37,8 @@ def test_hallucination_metric_3(): input="placeholder", actual_output="Python is a programming language.", context=["Python is a snake."], + cost=0.1, + latency=13.0, ) with pytest.raises(AssertionError): assert_test(test_case, [metric]) diff --git a/tests/test_latency.py b/tests/test_latency.py deleted file mode 100644 index c8fc05dd5..000000000 --- a/tests/test_latency.py +++ /dev/null @@ -1,13 +0,0 @@ -from deepeval.metrics import LatencyMetric -from deepeval.test_case import LLMTestCase -from deepeval import assert_test - - -def test_latency_metric(): - metric = LatencyMetric(threshold=12) - test_case = LLMTestCase( - input="...", - actual_output="...", - latency=8.3, - ) - assert_test(test_case, [metric]) diff --git a/tests/test_performance.py b/tests/test_performance.py new file mode 100644 index 000000000..a3be64baa --- /dev/null +++ b/tests/test_performance.py @@ -0,0 +1,19 @@ +from deepeval.metrics import LatencyMetric, CostMetric +from deepeval.test_case import LLMTestCase +from deepeval import assert_test + + +def test_cost_metric(): + metric = CostMetric(max_cost=12) + test_case = LLMTestCase(input="...", actual_output="...", cost=12) + assert_test(test_case, [metric]) + + +def test_latency_metric(): + metric = LatencyMetric(max_latency=12) + test_case = LLMTestCase( + input="...", + actual_output="...", + latency=8.3, + ) + assert_test(test_case, [metric]) diff --git a/tests/test_rag_metrics.py b/tests/test_rag_metrics.py index 19a626388..b12969ab7 100644 --- a/tests/test_rag_metrics.py +++ b/tests/test_rag_metrics.py @@ -18,4 +18,4 @@ def test_answer_relevancy_again(): ) relevancy_metric = AnswerRelevancyMetric(threshold=0.5) faithfulness_metric = FaithfulnessMetric(threshold=0.5) - assert_test(test_case, [faithfulness_metric]) + assert_test(test_case, [faithfulness_metric, relevancy_metric]) diff --git a/tests/test_toxic.py b/tests/test_toxic.py index 821fc7e72..523f0e2ca 100644 --- a/tests/test_toxic.py +++ b/tests/test_toxic.py @@ -4,14 +4,14 @@ # import pytest # from deepeval.test_case import LLMTestCase, LLMTestCaseParams -# from deepeval.metrics import NonToxicMetric +# from deepeval.metrics import ToxicityMetric # from deepeval import assert_test # output = "Winners of the FIFA world cup were the French national football team" # TODO: uncomment once detoxify fixed # def test_non_toxic(): -# metric = NonToxicMetric(evaluation_params=[LLMTestCaseParams.INPUT]) +# metric = ToxicityMetric(evaluation_params=[LLMTestCaseParams.INPUT]) # test_case = LLMTestCase( # input="Who won the FIFA World Cup in 2018?", actual_output="Don't know." 
# ) @@ -20,12 +20,12 @@ # def test_non_toxic_2(): # test_case = LLMTestCase(input="Placeholder", actual_output="You're stupid") -# metric = NonToxicMetric(evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT]) +# metric = ToxicityMetric(evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT]) # with pytest.raises(AssertionError): # assert_test(test_case, [metric]) # def test_non_toxic_metric(): -# metric = NonToxicMetric(evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT]) +# metric = ToxicityMetric(evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT]) # test_case = LLMTestCase(input="placeholder", actual_output=output) # assert_test(test_case, [metric])
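Once the detoxify issue referenced in the TODO is resolved, these commented-out tests would presumably be rewritten against the new `ToxicityMetric` API (which, per `deepeval/metrics/toxicity.py` above, scores only the `actual_output` and treats `threshold` as a maximum). A hedged sketch:

```python
from deepeval import assert_test
from deepeval.metrics import ToxicityMetric
from deepeval.test_case import LLMTestCase


def test_non_toxic():
    # Lower scores are better; the test passes when score <= threshold
    metric = ToxicityMetric(threshold=0.5)
    test_case = LLMTestCase(
        input="Who won the FIFA World Cup in 2018?",
        actual_output="Don't know.",
    )
    assert_test(test_case, [metric])
```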