From e5da502d24a430312944f13c8e933b308c0444c1 Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Sun, 17 Dec 2023 16:13:11 +0530 Subject: [PATCH 01/74] FEAT: Adding callbacks functionality --- deepeval/callbacks/__init__.py | 1 + deepeval/callbacks/base_callback.py | 105 +++++++++++++++++++++ deepeval/callbacks/huggingface_callback.py | 0 3 files changed, 106 insertions(+) create mode 100644 deepeval/callbacks/__init__.py create mode 100644 deepeval/callbacks/base_callback.py create mode 100644 deepeval/callbacks/huggingface_callback.py diff --git a/deepeval/callbacks/__init__.py b/deepeval/callbacks/__init__.py new file mode 100644 index 000000000..5f94d58f8 --- /dev/null +++ b/deepeval/callbacks/__init__.py @@ -0,0 +1 @@ +from deepeval.callbacks.base_callback import BaseCallback \ No newline at end of file diff --git a/deepeval/callbacks/base_callback.py b/deepeval/callbacks/base_callback.py new file mode 100644 index 000000000..dbc1721a1 --- /dev/null +++ b/deepeval/callbacks/base_callback.py @@ -0,0 +1,105 @@ +from abc import ABC, abstractmethod + +class BaseCallback(ABC): + """ + Base class for training callbacks in deepeval. + + Attributes: + - metrics (list): List of metrics to be evaluated. + - evaluate_every (int): Frequency of metric evaluation during training. + """ + + def __init__(self, metrics=None, evaluate_every=1): + """ + Initialize the BaseCallback. + + Args: + metrics (list, optional): List of metrics to be evaluated. + evaluate_every (int, optional): Frequency of metric evaluation during training. + """ + self.metrics = metrics or [] + self.evaluate_every = evaluate_every + + @abstractmethod + def on_epoch_begin(self, trainer, epoch, logs=None): + """ + Called at the beginning of each epoch. + + Args: + trainer: The training framework's trainer object. + epoch (int): Current epoch. + logs (dict, optional): Dictionary to store additional information. + """ + pass + + @abstractmethod + def on_epoch_end(self, trainer, epoch, logs=None): + """ + Called at the end of each epoch. + + Args: + trainer: The training framework's trainer object. + epoch (int): Current epoch. + logs (dict, optional): Dictionary to store additional information. + """ + pass + + @abstractmethod + def on_batch_begin(self, trainer, batch, logs=None): + """ + Called at the beginning of each batch. + + Args: + trainer: The training framework's trainer object. + batch: Current batch. + logs (dict, optional): Dictionary to store additional information. + """ + pass + + @abstractmethod + def on_batch_end(self, trainer, batch, logs=None): + """ + Called at the end of each batch. + + Args: + trainer: The training framework's trainer object. + batch: Current batch. + logs (dict, optional): Dictionary to store additional information. + """ + pass + + def evaluate_metrics(self, trainer): + """ + Evaluate metrics based on the specified frequency. + + Args: + trainer: The training framework's trainer object. + + Returns: + dict: Dictionary containing metric results. + """ + pass + + @abstractmethod + def compute_metric(self, trainer, metric): + """ + Compute the value of a specific metric. + + Args: + trainer: The training framework's trainer object. + metric (str): The metric to be computed. + + Returns: + float: Computed metric value. + """ + pass + + def log_metrics(self, epoch, metrics_results): + """ + Log the evaluated metrics. + + Args: + epoch (int): Current epoch. + metrics_results (dict): Dictionary containing metric results. 
+ """ + pass diff --git a/deepeval/callbacks/huggingface_callback.py b/deepeval/callbacks/huggingface_callback.py new file mode 100644 index 000000000..e69de29bb From b87c5eb527257c51f499a172d48dd20124e5b799 Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Thu, 21 Dec 2023 20:38:36 +0530 Subject: [PATCH 02/74] CHORE: Structure change removed base_callback and using transformer's TrainerCallback as base_callback --- deepeval/callbacks/__init__.py | 1 - deepeval/callbacks/base_callback.py | 105 ------------------ deepeval/callbacks/huggingface/__init__.py | 2 + .../huggingface/deepeval_callback.py | 38 +++++++ .../huggingface/deepeval_harness_callback.py | 14 +++ .../utils.py} | 0 6 files changed, 54 insertions(+), 106 deletions(-) delete mode 100644 deepeval/callbacks/base_callback.py create mode 100644 deepeval/callbacks/huggingface/__init__.py create mode 100644 deepeval/callbacks/huggingface/deepeval_callback.py create mode 100644 deepeval/callbacks/huggingface/deepeval_harness_callback.py rename deepeval/callbacks/{huggingface_callback.py => huggingface/utils.py} (100%) diff --git a/deepeval/callbacks/__init__.py b/deepeval/callbacks/__init__.py index 5f94d58f8..e69de29bb 100644 --- a/deepeval/callbacks/__init__.py +++ b/deepeval/callbacks/__init__.py @@ -1 +0,0 @@ -from deepeval.callbacks.base_callback import BaseCallback \ No newline at end of file diff --git a/deepeval/callbacks/base_callback.py b/deepeval/callbacks/base_callback.py deleted file mode 100644 index dbc1721a1..000000000 --- a/deepeval/callbacks/base_callback.py +++ /dev/null @@ -1,105 +0,0 @@ -from abc import ABC, abstractmethod - -class BaseCallback(ABC): - """ - Base class for training callbacks in deepeval. - - Attributes: - - metrics (list): List of metrics to be evaluated. - - evaluate_every (int): Frequency of metric evaluation during training. - """ - - def __init__(self, metrics=None, evaluate_every=1): - """ - Initialize the BaseCallback. - - Args: - metrics (list, optional): List of metrics to be evaluated. - evaluate_every (int, optional): Frequency of metric evaluation during training. - """ - self.metrics = metrics or [] - self.evaluate_every = evaluate_every - - @abstractmethod - def on_epoch_begin(self, trainer, epoch, logs=None): - """ - Called at the beginning of each epoch. - - Args: - trainer: The training framework's trainer object. - epoch (int): Current epoch. - logs (dict, optional): Dictionary to store additional information. - """ - pass - - @abstractmethod - def on_epoch_end(self, trainer, epoch, logs=None): - """ - Called at the end of each epoch. - - Args: - trainer: The training framework's trainer object. - epoch (int): Current epoch. - logs (dict, optional): Dictionary to store additional information. - """ - pass - - @abstractmethod - def on_batch_begin(self, trainer, batch, logs=None): - """ - Called at the beginning of each batch. - - Args: - trainer: The training framework's trainer object. - batch: Current batch. - logs (dict, optional): Dictionary to store additional information. - """ - pass - - @abstractmethod - def on_batch_end(self, trainer, batch, logs=None): - """ - Called at the end of each batch. - - Args: - trainer: The training framework's trainer object. - batch: Current batch. - logs (dict, optional): Dictionary to store additional information. - """ - pass - - def evaluate_metrics(self, trainer): - """ - Evaluate metrics based on the specified frequency. - - Args: - trainer: The training framework's trainer object. 
- - Returns: - dict: Dictionary containing metric results. - """ - pass - - @abstractmethod - def compute_metric(self, trainer, metric): - """ - Compute the value of a specific metric. - - Args: - trainer: The training framework's trainer object. - metric (str): The metric to be computed. - - Returns: - float: Computed metric value. - """ - pass - - def log_metrics(self, epoch, metrics_results): - """ - Log the evaluated metrics. - - Args: - epoch (int): Current epoch. - metrics_results (dict): Dictionary containing metric results. - """ - pass diff --git a/deepeval/callbacks/huggingface/__init__.py b/deepeval/callbacks/huggingface/__init__.py new file mode 100644 index 000000000..9f7612c02 --- /dev/null +++ b/deepeval/callbacks/huggingface/__init__.py @@ -0,0 +1,2 @@ +from deepeval.callbacks.huggingface.deepeval_callback import DeepEvalCallback +from deepeval.callbacks.huggingface.deepeval_harness_callback import DeepEvalHarnessCallback \ No newline at end of file diff --git a/deepeval/callbacks/huggingface/deepeval_callback.py b/deepeval/callbacks/huggingface/deepeval_callback.py new file mode 100644 index 000000000..c4384fbe2 --- /dev/null +++ b/deepeval/callbacks/huggingface/deepeval_callback.py @@ -0,0 +1,38 @@ +from typing import Union, List + +from deepeval.metrics import BaseMetric +from deepeval.dataset import EvaluationDataset +from transformers.trainer_callback import TrainerCallback + + +class DeepEvalCallback(TrainerCallback): + """ + A [transformers.TrainerCallback] that logs various LLM evaluation metrics to DeepEval + """ + + def __init__(self, metrics: Union[BaseMetric, List[BaseMetric]], evaluation_dataset: EvaluationDataset): + super().__init__() + self.metrics = metrics + self.evaluation_dataset = evaluation_dataset + + + + def on_epoch_end(self, args, state, control, model, tokenizer, **kwargs): + # if self.eval_steps is not None and state.global_step % self.eval_steps == 0: + # input_text = "What if these shoes don't fit?" + # context = ["All customers are eligible for a 30 day full refund at no extra costs."] + # actual_output = "We offer a 30-day full refund at no extra costs." 
+ + # # Replace with actual logic for metric calculation + # hallucination_metric = HallucinationMetric(minimum_score=0.7) + # test_case = LLMTestCase(input=input_text, actual_output=actual_output, context=context) + # assert_test(test_case, [hallucination_metric]) + + # # Log or save the metric values as needed + print("---------ONE EPOCH ENDED---------") + print(model) + + def on_train_end(self, args, state, control, model, tokenizer, **kwargs): + print("---------TRAIN ENDED---------") + print(model) + \ No newline at end of file diff --git a/deepeval/callbacks/huggingface/deepeval_harness_callback.py b/deepeval/callbacks/huggingface/deepeval_harness_callback.py new file mode 100644 index 000000000..52c1ff6f1 --- /dev/null +++ b/deepeval/callbacks/huggingface/deepeval_harness_callback.py @@ -0,0 +1,14 @@ +from typing import List, Union + +from transformers.trainer_callback import TrainerCallback +from deepeval.experimental import BaseEvaluationExperiment + + +class DeepEvalHarnessCallback(TrainerCallback): + """ + A [transformers.TrainerCallback] that logs various harness LLM evaluation metrics to DeepEval + """ + + def __init__(self, experiments: Union[BaseEvaluationExperiment, List[BaseEvaluationExperiment]]): + super().__init__() + self.experiments = experiments \ No newline at end of file diff --git a/deepeval/callbacks/huggingface_callback.py b/deepeval/callbacks/huggingface/utils.py similarity index 100% rename from deepeval/callbacks/huggingface_callback.py rename to deepeval/callbacks/huggingface/utils.py From 73962f10e9f7406b4d2d65e8619c0c54c0797a1b Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Fri, 22 Dec 2023 08:08:40 +0530 Subject: [PATCH 03/74] CHORE: Created some methods for on_epoch_end, 'DeepEvalCallback' --- deepeval/callbacks/huggingface/__init__.py | 2 +- .../huggingface/deepeval_callback.py | 127 +++++++++++++++--- 2 files changed, 106 insertions(+), 23 deletions(-) diff --git a/deepeval/callbacks/huggingface/__init__.py b/deepeval/callbacks/huggingface/__init__.py index 9f7612c02..664d17cf8 100644 --- a/deepeval/callbacks/huggingface/__init__.py +++ b/deepeval/callbacks/huggingface/__init__.py @@ -1,2 +1,2 @@ from deepeval.callbacks.huggingface.deepeval_callback import DeepEvalCallback -from deepeval.callbacks.huggingface.deepeval_harness_callback import DeepEvalHarnessCallback \ No newline at end of file +# from deepeval.callbacks.huggingface.deepeval_harness_callback import DeepEvalHarnessCallback \ No newline at end of file diff --git a/deepeval/callbacks/huggingface/deepeval_callback.py b/deepeval/callbacks/huggingface/deepeval_callback.py index c4384fbe2..ba36fed0a 100644 --- a/deepeval/callbacks/huggingface/deepeval_callback.py +++ b/deepeval/callbacks/huggingface/deepeval_callback.py @@ -1,38 +1,121 @@ -from typing import Union, List +from typing import Union, List, Dict from deepeval.metrics import BaseMetric from deepeval.dataset import EvaluationDataset +from deepeval.evaluate import execute_test + from transformers.trainer_callback import TrainerCallback +# TODO: +# 1. Dataset has to be created dynamically +# 2. add score to default dict returned by on_epoch_end +# 3. Make code more presentable + class DeepEvalCallback(TrainerCallback): """ - A [transformers.TrainerCallback] that logs various LLM evaluation metrics to DeepEval + A Transformers TrainerCallback that logs various Language Model (LM) evaluation metrics to DeepEval. 
""" - - def __init__(self, metrics: Union[BaseMetric, List[BaseMetric]], evaluation_dataset: EvaluationDataset): + + def __init__( + self, + metrics: Union[BaseMetric, List[BaseMetric]] = None, + evaluation_dataset: EvaluationDataset = None, + tokenizer_args: Dict = {}, + aggregation_method: str = "average", + ): + """ + Initialize the DeepEvalCallback. + + Args: + metrics (Union[BaseMetric, List[BaseMetric]], optional): Evaluation metrics to calculate. + Defaults to None. + evaluation_dataset (EvaluationDataset, optional): Dataset for evaluation. Defaults to None. + tokenizer_args (Dict, optional): Additional arguments for tokenizer. Defaults to {}. + aggregation_method (str, optional): Aggregation method for metric scores ("average", "max", "min"). + Defaults to "average". + """ super().__init__() self.metrics = metrics self.evaluation_dataset = evaluation_dataset - - - - def on_epoch_end(self, args, state, control, model, tokenizer, **kwargs): - # if self.eval_steps is not None and state.global_step % self.eval_steps == 0: - # input_text = "What if these shoes don't fit?" - # context = ["All customers are eligible for a 30 day full refund at no extra costs."] - # actual_output = "We offer a 30-day full refund at no extra costs." + self.tokenizer_args = tokenizer_args + self.aggregation_method = aggregation_method + + def _calculate_scores(self) -> Dict[str, List[float]]: + """ + Calculate scores for each evaluation metric. + + Returns: + Dict[str, List[float]]: Dictionary containing metric names and corresponding scores. + """ + test_results = execute_test( + test_cases=self.evaluation_dataset.test_cases, + metrics=self.metrics + ) + + scores = {} + for test in test_results: + for metric in test.metrics: + metric_name = str(metric.__name__).lower().replace(" ", "_") + metric_score = metric.score + scores.setdefault(metric_name, []).append(metric_score) + + return scores - # # Replace with actual logic for metric calculation - # hallucination_metric = HallucinationMetric(minimum_score=0.7) - # test_case = LLMTestCase(input=input_text, actual_output=actual_output, context=context) - # assert_test(test_case, [hallucination_metric]) + def _aggregate_scores( + self, aggregation_method: str, + scores: Dict[str, List[float]] + ) -> Dict[str, float]: + """ + Aggregate metric scores based on the specified method. - # # Log or save the metric values as needed - print("---------ONE EPOCH ENDED---------") - print(model) + Args: + aggregation_method (str): Aggregation method ("average", "max", "min"). + scores (Dict[str, List[float]]): Dictionary containing metric names and scores. + + Returns: + Dict[str, float]: Dictionary containing aggregated metric names and scores. + """ + if aggregation_method in ["average", "avg"]: + scores = {key: (sum(value) / len(value)) for key, value in scores.items()} + elif aggregation_method == "max": + scores = {key: (max(value)) for key, value in scores.items()} + elif aggregation_method == "min": + scores = {key: (min(value)) for key, value in scores.items()} + else: + raise ValueError("Incorrect 'aggregation_method' passed, only accepts ['avg', 'min, 'max']") + return scores + + def on_epoch_end(self, args, state, control, model, tokenizer, **kwargs): + """ + Called at the end of each training epoch. + + Args: + args: Training arguments. + state: Training state. + control: Training control. + model: The current model. + tokenizer: Tokenizer used for evaluation. + kwargs: Additional keyword arguments. 
+ """ + scores = self._calculate_scores() + scores = self._aggregate_scores(self.aggregation_method, scores) + print(scores) def on_train_end(self, args, state, control, model, tokenizer, **kwargs): - print("---------TRAIN ENDED---------") - print(model) - \ No newline at end of file + """ + Called at the end of the training process. + + Args: + args: Training arguments. + state: Training state. + control: Training control. + model: The final model. + tokenizer: Tokenizer used for evaluation. + kwargs: Additional keyword arguments. + """ + print("---------TRAIN END---------") + + + +# REWRITE THIS IN A BETTER PROFESSIONAL WAY, MAKE DOCS AND ERRORS MORE PROFESSIONAL, MAKE CODE BETTER, OPTIMIZE THE CODE AS MUCH AS YOU CAN TO LOOK BETTER \ No newline at end of file From 77af0ccd97eda4571b66ec5661061749ead6f111 Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Mon, 25 Dec 2023 00:00:49 +0530 Subject: [PATCH 04/74] FEAT: Added custom metrics table for CLI --- .../huggingface/deepeval_callback.py | 172 +++++++++++------- 1 file changed, 102 insertions(+), 70 deletions(-) diff --git a/deepeval/callbacks/huggingface/deepeval_callback.py b/deepeval/callbacks/huggingface/deepeval_callback.py index ba36fed0a..aec929dc8 100644 --- a/deepeval/callbacks/huggingface/deepeval_callback.py +++ b/deepeval/callbacks/huggingface/deepeval_callback.py @@ -1,121 +1,153 @@ from typing import Union, List, Dict +from rich.console import Console +from rich.table import Table +from rich.live import Live +from transformers import TrainerCallback, \ + ProgressCallback, Trainer, \ + TrainingArguments, TrainerState, TrainerControl + from deepeval.metrics import BaseMetric from deepeval.dataset import EvaluationDataset from deepeval.evaluate import execute_test -from transformers.trainer_callback import TrainerCallback - - -# TODO: -# 1. Dataset has to be created dynamically -# 2. add score to default dict returned by on_epoch_end -# 3. Make code more presentable class DeepEvalCallback(TrainerCallback): """ - A Transformers TrainerCallback that logs various Language Model (LM) evaluation metrics to DeepEval. + Custom callback for deep evaluation during model training. + + Args: + metrics (Union[BaseMetric, List[BaseMetric]]): Evaluation metrics. + evaluation_dataset (EvaluationDataset): Dataset for evaluation. + tokenizer_args (Dict): Arguments for the tokenizer. + aggregation_method (str): Method for aggregating metric scores. + trainer (Trainer): Model trainer. """ def __init__( self, metrics: Union[BaseMetric, List[BaseMetric]] = None, evaluation_dataset: EvaluationDataset = None, - tokenizer_args: Dict = {}, - aggregation_method: str = "average", - ): - """ - Initialize the DeepEvalCallback. - - Args: - metrics (Union[BaseMetric, List[BaseMetric]], optional): Evaluation metrics to calculate. - Defaults to None. - evaluation_dataset (EvaluationDataset, optional): Dataset for evaluation. Defaults to None. - tokenizer_args (Dict, optional): Additional arguments for tokenizer. Defaults to {}. - aggregation_method (str, optional): Aggregation method for metric scores ("average", "max", "min"). - Defaults to "average". 
- """ + tokenizer_args: Dict = None, + aggregation_method: str = "avg", + trainer: Trainer = None + ) -> None: super().__init__() self.metrics = metrics self.evaluation_dataset = evaluation_dataset self.tokenizer_args = tokenizer_args self.aggregation_method = aggregation_method + self.trainer = trainer + + self.epoch_counter = 0 + self.log_history = [] + self._initiate_rich_console() + + def _initiate_rich_console(self) -> None: + """ + Initiate rich console for progress tracking. + """ + console = Console() + self.live = Live(auto_refresh=True, console=console) + self.trainer.remove_callback(ProgressCallback) - def _calculate_scores(self) -> Dict[str, List[float]]: + def _calculate_metric_scores(self) -> Dict[str, List[float]]: """ - Calculate scores for each evaluation metric. + Calculate final evaluation scores based on metrics and test cases. Returns: - Dict[str, List[float]]: Dictionary containing metric names and corresponding scores. + Dict[str, List[float]]: Metric scores for each test case. """ test_results = execute_test( test_cases=self.evaluation_dataset.test_cases, metrics=self.metrics ) - scores = {} for test in test_results: for metric in test.metrics: metric_name = str(metric.__name__).lower().replace(" ", "_") metric_score = metric.score scores.setdefault(metric_name, []).append(metric_score) - + + scores = self._aggregate_scores(scores) return scores - def _aggregate_scores( - self, aggregation_method: str, + def _aggregate_scores(self, scores: Dict[str, List[float]] ) -> Dict[str, float]: """ - Aggregate metric scores based on the specified method. + Aggregate metric scores using the specified method. Args: - aggregation_method (str): Aggregation method ("average", "max", "min"). - scores (Dict[str, List[float]]): Dictionary containing metric names and scores. + aggregation_method (str): Method for aggregating scores. + scores (Dict[str, List[float]]): Metric scores for each test case. Returns: - Dict[str, float]: Dictionary containing aggregated metric names and scores. + Dict[str, float]: Aggregated metric scores. """ - if aggregation_method in ["average", "avg"]: - scores = {key: (sum(value) / len(value)) for key, value in scores.items()} - elif aggregation_method == "max": - scores = {key: (max(value)) for key, value in scores.items()} - elif aggregation_method == "min": - scores = {key: (min(value)) for key, value in scores.items()} - else: - raise ValueError("Incorrect 'aggregation_method' passed, only accepts ['avg', 'min, 'max']") - return scores - - def on_epoch_end(self, args, state, control, model, tokenizer, **kwargs): + aggregation_functions = { + "avg": lambda x: sum(x) / len(x), + "max": max, + "min": min, + } + if self.aggregation_method not in aggregation_functions: + raise ValueError("Incorrect 'aggregation_method', only accepts ['avg', 'min, 'max']") + return { + key: aggregation_functions[self.aggregation_method](value) \ + for key, value in scores.items() + } + + def on_epoch_end(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs + ): """ - Called at the end of each training epoch. - - Args: - args: Training arguments. - state: Training state. - control: Training control. - model: The current model. - tokenizer: Tokenizer used for evaluation. - kwargs: Additional keyword arguments. + Event triggered at the end of each training epoch. 
""" - scores = self._calculate_scores() - scores = self._aggregate_scores(self.aggregation_method, scores) - print(scores) - - def on_train_end(self, args, state, control, model, tokenizer, **kwargs): + self.epoch_counter += 1 + scores = self._calculate_metric_scores() + self.log_history.append(scores) + control.should_log = True + + return control + + def on_log(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs + ): + """ + Event triggered after logging the last logs. """ - Called at the end of the training process. + if not control.should_training_stop: + state.log_history[-1].update(self.log_history[-1]) + log_history = state.log_history + + def generate_table(): + new_table = Table() + cols = log_history[-1].keys() + for key in cols: + new_table.add_column(key) + for row in log_history: + new_table.add_row(*[str(value) for value in row.values()]) + return new_table + + with self.live: + self.live.console.clear() + self.live.update(generate_table(), refresh=True) + else: + pass - Args: - args: Training arguments. - state: Training state. - control: Training control. - model: The final model. - tokenizer: Tokenizer used for evaluation. - kwargs: Additional keyword arguments. + def on_train_end(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs + ): + """ + Event triggered at the end of model training. """ print("---------TRAIN END---------") - - - -# REWRITE THIS IN A BETTER PROFESSIONAL WAY, MAKE DOCS AND ERRORS MORE PROFESSIONAL, MAKE CODE BETTER, OPTIMIZE THE CODE AS MUCH AS YOU CAN TO LOOK BETTER \ No newline at end of file From 08eb92022aa25ce0ff0d318fe62a49f9906b9a21 Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Mon, 25 Dec 2023 19:55:11 +0530 Subject: [PATCH 05/74] FIX: fixed issues with progress table and added progress bar --- .../huggingface/deepeval_callback.py | 40 ++++++++++++++----- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/deepeval/callbacks/huggingface/deepeval_callback.py b/deepeval/callbacks/huggingface/deepeval_callback.py index aec929dc8..901be5625 100644 --- a/deepeval/callbacks/huggingface/deepeval_callback.py +++ b/deepeval/callbacks/huggingface/deepeval_callback.py @@ -1,5 +1,6 @@ from typing import Union, List, Dict +import tqdm from rich.console import Console from rich.table import Table from rich.live import Live @@ -30,9 +31,12 @@ def __init__( evaluation_dataset: EvaluationDataset = None, tokenizer_args: Dict = None, aggregation_method: str = "avg", - trainer: Trainer = None + trainer: Trainer = None, + show_table: bool = False ) -> None: super().__init__() + + self.show_table = show_table self.metrics = metrics self.evaluation_dataset = evaluation_dataset self.tokenizer_args = tokenizer_args @@ -42,13 +46,15 @@ def __init__( self.epoch_counter = 0 self.log_history = [] self._initiate_rich_console() - + def _initiate_rich_console(self) -> None: """ Initiate rich console for progress tracking. 
""" - console = Console() - self.live = Live(auto_refresh=True, console=console) + if self.show_table: + self.console = Console() + self.live = Live(auto_refresh=True, console=self.console) + self.live.start() self.trainer.remove_callback(ProgressCallback) def _calculate_metric_scores(self) -> Dict[str, List[float]]: @@ -68,7 +74,7 @@ def _calculate_metric_scores(self) -> Dict[str, List[float]]: metric_name = str(metric.__name__).lower().replace(" ", "_") metric_score = metric.score scores.setdefault(metric_name, []).append(metric_score) - + scores = self._aggregate_scores(scores) return scores @@ -109,6 +115,7 @@ def on_epoch_end(self, self.epoch_counter += 1 scores = self._calculate_metric_scores() self.log_history.append(scores) + self.progress.update(1) control.should_log = True return control @@ -134,9 +141,8 @@ def generate_table(): for row in log_history: new_table.add_row(*[str(value) for value in row.values()]) return new_table - - with self.live: - self.live.console.clear() + + if self.show_table: self.live.update(generate_table(), refresh=True) else: pass @@ -150,4 +156,20 @@ def on_train_end(self, """ Event triggered at the end of model training. """ - print("---------TRAIN END---------") + self.progress.stop() + + def on_train_begin(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs + ): + """ + Event triggered at the begining of model training. + """ + self.progress = tqdm( + total=self.trainer.args.num_train_epochs, + desc="Epochs" + ) + + From 410b26872485eac6c8bb774a3fb1bebfc5479bdb Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Thu, 28 Dec 2023 02:25:55 +0530 Subject: [PATCH 06/74] FEAT: Added calc metric every x epoch and fixed some issues --- .../huggingface/deepeval_callback.py | 49 +++++++++++-------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/deepeval/callbacks/huggingface/deepeval_callback.py b/deepeval/callbacks/huggingface/deepeval_callback.py index 901be5625..ad367db2d 100644 --- a/deepeval/callbacks/huggingface/deepeval_callback.py +++ b/deepeval/callbacks/huggingface/deepeval_callback.py @@ -1,6 +1,6 @@ from typing import Union, List, Dict -import tqdm +from tqdm import tqdm from rich.console import Console from rich.table import Table from rich.live import Live @@ -32,11 +32,13 @@ def __init__( tokenizer_args: Dict = None, aggregation_method: str = "avg", trainer: Trainer = None, - show_table: bool = False + show_table: bool = False, + show_table_every: int = 1 ) -> None: super().__init__() self.show_table = show_table + self.show_table_every = show_table_every self.metrics = metrics self.evaluation_dataset = evaluation_dataset self.tokenizer_args = tokenizer_args @@ -44,7 +46,7 @@ def __init__( self.trainer = trainer self.epoch_counter = 0 - self.log_history = [] + self.deepeval_metric_history = [] self._initiate_rich_console() def _initiate_rich_console(self) -> None: @@ -102,6 +104,17 @@ def _aggregate_scores(self, key: aggregation_functions[self.aggregation_method](value) \ for key, value in scores.items() } + + def on_epoch_begin(self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs + ): + """ + Event triggered at the begining of each training epoch. + """ + self.epoch_counter += 1 def on_epoch_end(self, args: TrainingArguments, @@ -112,13 +125,9 @@ def on_epoch_end(self, """ Event triggered at the end of each training epoch. 
""" - self.epoch_counter += 1 - scores = self._calculate_metric_scores() - self.log_history.append(scores) self.progress.update(1) control.should_log = True - - return control + def on_log(self, args: TrainingArguments, @@ -129,23 +138,23 @@ def on_log(self, """ Event triggered after logging the last logs. """ - if not control.should_training_stop: - state.log_history[-1].update(self.log_history[-1]) - log_history = state.log_history + if ( + self.show_table + and (self.epoch_counter % self.show_table_every == 0) + and len(state.log_history) <= self.trainer.args.num_train_epochs + ): + scores = self._calculate_metric_scores() + self.deepeval_metric_history.append(scores) + self.deepeval_metric_history[-1].update(state.log_history[-1]) def generate_table(): new_table = Table() - cols = log_history[-1].keys() - for key in cols: + for key in self.deepeval_metric_history[-1].keys(): new_table.add_column(key) - for row in log_history: + for row in self.deepeval_metric_history: new_table.add_row(*[str(value) for value in row.values()]) return new_table - - if self.show_table: - self.live.update(generate_table(), refresh=True) - else: - pass + self.live.update(generate_table(), refresh=True) def on_train_end(self, args: TrainingArguments, @@ -156,7 +165,7 @@ def on_train_end(self, """ Event triggered at the end of model training. """ - self.progress.stop() + self.progress.close() def on_train_begin(self, args: TrainingArguments, From 1dcdc8d93722066e24a0fd1020a8f42c06bdd6e0 Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Thu, 28 Dec 2023 03:07:14 +0530 Subject: [PATCH 07/74] CHORE: added progress_context while evaluation --- deepeval/callbacks/huggingface/deepeval_callback.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/deepeval/callbacks/huggingface/deepeval_callback.py b/deepeval/callbacks/huggingface/deepeval_callback.py index ad367db2d..b4f5290e4 100644 --- a/deepeval/callbacks/huggingface/deepeval_callback.py +++ b/deepeval/callbacks/huggingface/deepeval_callback.py @@ -11,6 +11,7 @@ from deepeval.metrics import BaseMetric from deepeval.dataset import EvaluationDataset from deepeval.evaluate import execute_test +from deepeval.progress_context import progress_context class DeepEvalCallback(TrainerCallback): @@ -143,9 +144,10 @@ def on_log(self, and (self.epoch_counter % self.show_table_every == 0) and len(state.log_history) <= self.trainer.args.num_train_epochs ): - scores = self._calculate_metric_scores() - self.deepeval_metric_history.append(scores) - self.deepeval_metric_history[-1].update(state.log_history[-1]) + with progress_context("Evaluating testcases..."): + scores = self._calculate_metric_scores() + self.deepeval_metric_history.append(scores) + self.deepeval_metric_history[-1].update(state.log_history[-1]) def generate_table(): new_table = Table() From 004ef1cb5f5bef37bc2829bd7febc6942130d4e3 Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Tue, 16 Jan 2024 22:45:13 +0530 Subject: [PATCH 08/74] FIX: fixed overlapping of progress-bars with tables --- .../huggingface/deepeval_callback.py | 60 +++++++++++++------ .../huggingface/deepeval_harness_callback.py | 4 +- 2 files changed, 44 insertions(+), 20 deletions(-) diff --git a/deepeval/callbacks/huggingface/deepeval_callback.py b/deepeval/callbacks/huggingface/deepeval_callback.py index b4f5290e4..cad095238 100644 --- a/deepeval/callbacks/huggingface/deepeval_callback.py +++ b/deepeval/callbacks/huggingface/deepeval_callback.py @@ -4,6 +4,10 @@ from rich.console import Console from rich.table import 
Table from rich.live import Live +from rich.columns import Columns +from rich.progress import Progress, BarColumn, \ + SpinnerColumn, TextColumn + from transformers import TrainerCallback, \ ProgressCallback, Trainer, \ TrainingArguments, TrainerState, TrainerControl @@ -11,7 +15,6 @@ from deepeval.metrics import BaseMetric from deepeval.dataset import EvaluationDataset from deepeval.evaluate import execute_test -from deepeval.progress_context import progress_context class DeepEvalCallback(TrainerCallback): @@ -46,6 +49,7 @@ def __init__( self.aggregation_method = aggregation_method self.trainer = trainer + self.train_bar_started = False self.epoch_counter = 0 self.deepeval_metric_history = [] self._initiate_rich_console() @@ -126,7 +130,7 @@ def on_epoch_end(self, """ Event triggered at the end of each training epoch. """ - self.progress.update(1) + control.should_log = True @@ -139,24 +143,35 @@ def on_log(self, """ Event triggered after logging the last logs. """ + + if not self.train_bar_started: + self.progress.start() + self.train_bar_started = True + if ( - self.show_table - and (self.epoch_counter % self.show_table_every == 0) + self.show_table and len(state.log_history) <= self.trainer.args.num_train_epochs ): - with progress_context("Evaluating testcases..."): + self.progress.update(self.progress_task, advance=1) + if self.epoch_counter % self.show_table_every == 0: + self.spinner.reset(self.spinner_task, description="[STATUS] Evaluating test-cases (might take up few minutes) ...") + scores = self._calculate_metric_scores() self.deepeval_metric_history.append(scores) self.deepeval_metric_history[-1].update(state.log_history[-1]) - - def generate_table(): - new_table = Table() - for key in self.deepeval_metric_history[-1].keys(): - new_table.add_column(key) - for row in self.deepeval_metric_history: - new_table.add_row(*[str(value) for value in row.values()]) - return new_table - self.live.update(generate_table(), refresh=True) + + self.spinner.reset(self.spinner_task, description="[STATUS] Training in Progress ...") + + def generate_table(): + new_table = Table() + cols = Columns([new_table, self.spinner, self.progress], equal=True, expand=True) + for key in self.deepeval_metric_history[-1].keys(): + new_table.add_column(key) + for row in self.deepeval_metric_history: + new_table.add_row(*[str(value) for value in row.values()]) + return cols + + self.live.update(generate_table(), refresh=True) def on_train_end(self, args: TrainingArguments, @@ -167,7 +182,7 @@ def on_train_end(self, """ Event triggered at the end of model training. """ - self.progress.close() + self.progress.stop() def on_train_begin(self, args: TrainingArguments, @@ -178,9 +193,16 @@ def on_train_begin(self, """ Event triggered at the begining of model training. 
""" - self.progress = tqdm( - total=self.trainer.args.num_train_epochs, - desc="Epochs" + self.progress = Progress( + TextColumn("{task.description} [progress.percentage][{task.percentage:>3.1f}%]:", justify="right"), + BarColumn(), + TextColumn("[green][ {task.completed}/{task.total} epochs ]", justify="right"), ) + self.progress_task = self.progress.add_task("Train Progress", total=self.trainer.args.num_train_epochs) - + self.spinner = Progress( + SpinnerColumn(), + TextColumn("{task.description}", justify="right"), + transient=True + ) + self.spinner_task = self.spinner.add_task("[STATUS] Training in Progress ...", total=9999) \ No newline at end of file diff --git a/deepeval/callbacks/huggingface/deepeval_harness_callback.py b/deepeval/callbacks/huggingface/deepeval_harness_callback.py index 52c1ff6f1..76d0a5418 100644 --- a/deepeval/callbacks/huggingface/deepeval_harness_callback.py +++ b/deepeval/callbacks/huggingface/deepeval_harness_callback.py @@ -11,4 +11,6 @@ class DeepEvalHarnessCallback(TrainerCallback): def __init__(self, experiments: Union[BaseEvaluationExperiment, List[BaseEvaluationExperiment]]): super().__init__() - self.experiments = experiments \ No newline at end of file + self.experiments = experiments + + raise NotImplementedError("DeepEvalHarnessCallback is WIP") \ No newline at end of file From 46b400447bb1c38346f514624b084ef457339a41 Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Tue, 16 Jan 2024 23:54:03 +0530 Subject: [PATCH 09/74] FIX: fixed flikering of inital progress bars and order of columns --- .../huggingface/deepeval_callback.py | 21 ++++++++++++------- deepeval/callbacks/huggingface/utils.py | 5 +++++ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/deepeval/callbacks/huggingface/deepeval_callback.py b/deepeval/callbacks/huggingface/deepeval_callback.py index cad095238..3e2ffc85a 100644 --- a/deepeval/callbacks/huggingface/deepeval_callback.py +++ b/deepeval/callbacks/huggingface/deepeval_callback.py @@ -16,6 +16,8 @@ from deepeval.dataset import EvaluationDataset from deepeval.evaluate import execute_test +from .utils import reorder + class DeepEvalCallback(TrainerCallback): """ @@ -154,21 +156,22 @@ def on_log(self, ): self.progress.update(self.progress_task, advance=1) if self.epoch_counter % self.show_table_every == 0: - self.spinner.reset(self.spinner_task, description="[STATUS] Evaluating test-cases (might take up few minutes) ...") + self.spinner.reset(self.spinner_task, description="[STATUS] Evaluating test-cases (might take up few minutes)") scores = self._calculate_metric_scores() self.deepeval_metric_history.append(scores) self.deepeval_metric_history[-1].update(state.log_history[-1]) - self.spinner.reset(self.spinner_task, description="[STATUS] Training in Progress ...") + self.spinner.reset(self.spinner_task, description="[STATUS] Training in Progress") def generate_table(): new_table = Table() cols = Columns([new_table, self.spinner, self.progress], equal=True, expand=True) - for key in self.deepeval_metric_history[-1].keys(): + order = reorder(self.deepeval_metric_history[-1], ) + for key in order: new_table.add_column(key) for row in self.deepeval_metric_history: - new_table.add_row(*[str(value) for value in row.values()]) + new_table.add_row(*[str(row[value]) for value in order]) return cols self.live.update(generate_table(), refresh=True) @@ -194,15 +197,17 @@ def on_train_begin(self, Event triggered at the begining of model training. 
""" self.progress = Progress( - TextColumn("{task.description} [progress.percentage][{task.percentage:>3.1f}%]:", justify="right"), + TextColumn("{task.description} [progress.percentage][green][{task.percentage:>3.1f}%]:", justify="right"), BarColumn(), TextColumn("[green][ {task.completed}/{task.total} epochs ]", justify="right"), ) self.progress_task = self.progress.add_task("Train Progress", total=self.trainer.args.num_train_epochs) self.spinner = Progress( - SpinnerColumn(), TextColumn("{task.description}", justify="right"), - transient=True + SpinnerColumn(spinner_name="simpleDotsScrolling") ) - self.spinner_task = self.spinner.add_task("[STATUS] Training in Progress ...", total=9999) \ No newline at end of file + self.spinner_task = self.spinner.add_task("[blue][STATUS] [white]Training in Progress") + + initial_columns = Columns([Table(), self.spinner, self.progress], equal=True, expand=True) + self.live.update(initial_columns, refresh=True) \ No newline at end of file diff --git a/deepeval/callbacks/huggingface/utils.py b/deepeval/callbacks/huggingface/utils.py index e69de29bb..ed676460c 100644 --- a/deepeval/callbacks/huggingface/utils.py +++ b/deepeval/callbacks/huggingface/utils.py @@ -0,0 +1,5 @@ +def reorder(dic): + order = ["epoch", "step", "loss", "learning_rate"] + order.extend([key for key in dic.keys() if key not in order]) + + return order \ No newline at end of file From a1b84316de9e8e06c22f65d5ee8c91b03bdccc45 Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Wed, 17 Jan 2024 00:42:17 +0530 Subject: [PATCH 10/74] CHORE: code reformat --- .../huggingface/deepeval_callback.py | 79 +++++++++++-------- deepeval/callbacks/huggingface/utils.py | 4 +- 2 files changed, 47 insertions(+), 36 deletions(-) diff --git a/deepeval/callbacks/huggingface/deepeval_callback.py b/deepeval/callbacks/huggingface/deepeval_callback.py index 3e2ffc85a..deb7fafef 100644 --- a/deepeval/callbacks/huggingface/deepeval_callback.py +++ b/deepeval/callbacks/huggingface/deepeval_callback.py @@ -1,6 +1,5 @@ from typing import Union, List, Dict -from tqdm import tqdm from rich.console import Console from rich.table import Table from rich.live import Live @@ -16,7 +15,7 @@ from deepeval.dataset import EvaluationDataset from deepeval.evaluate import execute_test -from .utils import reorder +from .utils import get_column_order class DeepEvalCallback(TrainerCallback): @@ -51,6 +50,21 @@ def __init__( self.aggregation_method = aggregation_method self.trainer = trainer + self.task_descriptions = { + "training": "[blue][STATUS] [white]Training in Progress", + "evaluate": "[blue][STATUS] [white]Evaluating test-cases (might take up few minutes)", + "training_end": "[blue][STATUS] [white]Training Ended", + } + self.progress_bar_columns = [ + TextColumn("{task.description} [progress.percentage][green][{task.percentage:>3.1f}%]:", justify="right"), + BarColumn(), + TextColumn("[green][ {task.completed}/{task.total} epochs ]", justify="right"), + ] + self.spinner_columns = [ + TextColumn("{task.description}", justify="right"), + SpinnerColumn(spinner_name="simpleDotsScrolling") + ] + self.train_bar_started = False self.epoch_counter = 0 self.deepeval_metric_history = [] @@ -132,10 +146,8 @@ def on_epoch_end(self, """ Event triggered at the end of each training epoch. """ - control.should_log = True - def on_log(self, args: TrainingArguments, state: TrainerState, @@ -145,36 +157,41 @@ def on_log(self, """ Event triggered after logging the last logs. 
""" - if not self.train_bar_started: self.progress.start() self.train_bar_started = True - if ( - self.show_table - and len(state.log_history) <= self.trainer.args.num_train_epochs - ): + if (self.show_table and len(state.log_history) <= self.trainer.args.num_train_epochs): self.progress.update(self.progress_task, advance=1) + if self.epoch_counter % self.show_table_every == 0: - self.spinner.reset(self.spinner_task, description="[STATUS] Evaluating test-cases (might take up few minutes)") + self.spinner.reset(self.spinner_task, description=self.task_descriptions["evaluate"]) scores = self._calculate_metric_scores() self.deepeval_metric_history.append(scores) self.deepeval_metric_history[-1].update(state.log_history[-1]) - self.spinner.reset(self.spinner_task, description="[STATUS] Training in Progress") - - def generate_table(): - new_table = Table() - cols = Columns([new_table, self.spinner, self.progress], equal=True, expand=True) - order = reorder(self.deepeval_metric_history[-1], ) - for key in order: - new_table.add_column(key) - for row in self.deepeval_metric_history: - new_table.add_row(*[str(row[value]) for value in order]) - return cols + self.spinner.reset(self.spinner_task, description=self.task_descriptions["training"]) + self.live.update(self._generate_table(), refresh=True) - self.live.update(generate_table(), refresh=True) + def _generate_table(self): + """ + Generates table, along with progress bars + + Returns: + rich.Columns: contains table and 2 progress bars + """ + new_table = Table() + cols = Columns([new_table, self.spinner, self.progress], equal=True, expand=True) + order = get_column_order(self.deepeval_metric_history[-1]) + + for key in order: + new_table.add_column(key) + + for row in self.deepeval_metric_history: + new_table.add_row(*[str(row[value]) for value in order]) + + return cols def on_train_end(self, args: TrainingArguments, @@ -185,7 +202,8 @@ def on_train_end(self, """ Event triggered at the end of model training. """ - self.progress.stop() + self.spinner.reset(self.spinner_task, description=self.task_descriptions["training_end"]) + self.live.stop() def on_train_begin(self, args: TrainingArguments, @@ -196,18 +214,11 @@ def on_train_begin(self, """ Event triggered at the begining of model training. 
""" - self.progress = Progress( - TextColumn("{task.description} [progress.percentage][green][{task.percentage:>3.1f}%]:", justify="right"), - BarColumn(), - TextColumn("[green][ {task.completed}/{task.total} epochs ]", justify="right"), - ) - self.progress_task = self.progress.add_task("Train Progress", total=self.trainer.args.num_train_epochs) + self.progress = Progress(*self.progress_bar_columns) + self.spinner = Progress(*self.spinner_columns) - self.spinner = Progress( - TextColumn("{task.description}", justify="right"), - SpinnerColumn(spinner_name="simpleDotsScrolling") - ) - self.spinner_task = self.spinner.add_task("[blue][STATUS] [white]Training in Progress") + self.progress_task = self.progress.add_task("Train Progress", total=self.trainer.args.num_train_epochs) + self.spinner_task = self.spinner.add_task(self.task_descriptions["training"]) initial_columns = Columns([Table(), self.spinner, self.progress], equal=True, expand=True) self.live.update(initial_columns, refresh=True) \ No newline at end of file diff --git a/deepeval/callbacks/huggingface/utils.py b/deepeval/callbacks/huggingface/utils.py index ed676460c..76896b8ab 100644 --- a/deepeval/callbacks/huggingface/utils.py +++ b/deepeval/callbacks/huggingface/utils.py @@ -1,5 +1,5 @@ -def reorder(dic): +def get_column_order(scores: dict): order = ["epoch", "step", "loss", "learning_rate"] - order.extend([key for key in dic.keys() if key not in order]) + order.extend([key for key in scores.keys() if key not in order]) return order \ No newline at end of file From 210df2e7cb74a2a9291f3f528281e7d971bafd77 Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Wed, 17 Jan 2024 00:55:27 +0530 Subject: [PATCH 11/74] FEAT: Added test_callback script for testing --- tests/test_callbacks.py | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 tests/test_callbacks.py diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py new file mode 100644 index 000000000..58e25f20e --- /dev/null +++ b/tests/test_callbacks.py @@ -0,0 +1,3 @@ +"""Test for callbacks +""" + From d0afde7164a78ce824952eee0d15070b622fa3cf Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Wed, 17 Jan 2024 08:56:27 +0530 Subject: [PATCH 12/74] FEAT: Added test_callback script for testing --- tests/test_callbacks.py | 93 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py index 58e25f20e..0f2900fc6 100644 --- a/tests/test_callbacks.py +++ b/tests/test_callbacks.py @@ -1,3 +1,96 @@ """Test for callbacks """ +from transformers import Trainer, TrainingArguments +from transformers import AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import DataCollatorForLanguageModeling + +import datasets +import json + +from deepeval.callbacks.huggingface import DeepEvalCallback +from deepeval.metrics import HallucinationMetric, AnswerRelevancyMetric +from deepeval.test_case import LLMTestCase +from deepeval.dataset import EvaluationDataset + +# load dataset +f = open(r"D:\deepeval-callback\deepeval\build\ra_top_1000_data_set.json", 'r', encoding='utf-8').read() +data = json.loads(f) +final_data = {'text': [x['bio'] for x in data][:200]} +dataset = datasets.Dataset.from_dict(final_data) + +# initialize tokenizer +tokenizer = AutoTokenizer.from_pretrained( + "EleutherAI/gpt-neo-125M", + bos_token='<|startoftext|>', + eos_token='<|endoftext|>', + pad_token='<|pad|>' +) + +# initalize model +model = 
AutoModelForCausalLM.from_pretrained("roneneldan/TinyStories-1M") +model.resize_token_embeddings(len(tokenizer)) + +# create tokenized dataset +tokenizer_args = { + "return_tensors":"pt", + "max_length": 64, + "padding": "max_length", + "truncation": True +} + +def tokenize_function(examples): + return tokenizer(examples["text"], **tokenizer_args) +tokenized_datasets = dataset.map(tokenize_function, batched=True) + +data_collator = DataCollatorForLanguageModeling( + tokenizer=tokenizer, + mlm=False +) + +# create LLMTestCases +first_test_case = LLMTestCase( + input="What if these shoes don't fit?", + actual_output="We offer a 30-day full refund at no extra costs.", + context=["All customers are eligible for a 30 day full refund at no extra costs."] +) +second_test_case = LLMTestCase( + input="What if these shoes don't fit?", + actual_output="We also sell 20 gallons of pepsi", + context=["All customers are eligible for a 30 day full refund at no extra costs."] +) + +# create deepeval metrics list +dataset = EvaluationDataset(test_cases=[first_test_case, second_test_case]) +hallucination_metric = HallucinationMetric(minimum_score=0.3) +answer_relevancy_metric = AnswerRelevancyMetric(minimum_score=0.5) +metrics = [hallucination_metric, answer_relevancy_metric] + +# initalize training_args +training_args = TrainingArguments( + output_dir="./gpt2-fine-tuned", + overwrite_output_dir=True, + num_train_epochs=10, + per_device_train_batch_size=8 +) + +# initalize trainer +trainer = Trainer( + model=model, + args=training_args, + data_collator=data_collator, + train_dataset=tokenized_datasets +) + +# initalize DeepEvalCallback +callback = DeepEvalCallback( + metrics=metrics, + evaluation_dataset=dataset, + tokenizer_args=tokenizer_args, + trainer=trainer, + show_table=True, + show_table_every=1 +) +trainer.add_callback(callback) +trainer.train() \ No newline at end of file From c213f9b1fed63aaa23dc5ed968add18b85964034 Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Wed, 17 Jan 2024 12:37:55 +0530 Subject: [PATCH 13/74] CHORE: code lint --- deepeval/callbacks/huggingface/__init__.py | 4 +- .../huggingface/deepeval_callback.py | 130 +++++++++++------- .../huggingface/deepeval_harness_callback.py | 13 +- deepeval/callbacks/huggingface/utils.py | 4 +- tests/test_callbacks.py | 59 ++++---- 5 files changed, 132 insertions(+), 78 deletions(-) diff --git a/deepeval/callbacks/huggingface/__init__.py b/deepeval/callbacks/huggingface/__init__.py index 664d17cf8..3b63dc52d 100644 --- a/deepeval/callbacks/huggingface/__init__.py +++ b/deepeval/callbacks/huggingface/__init__.py @@ -1,2 +1,4 @@ from deepeval.callbacks.huggingface.deepeval_callback import DeepEvalCallback -# from deepeval.callbacks.huggingface.deepeval_harness_callback import DeepEvalHarnessCallback \ No newline at end of file +from deepeval.callbacks.huggingface.deepeval_harness_callback import ( + DeepEvalHarnessCallback, +) diff --git a/deepeval/callbacks/huggingface/deepeval_callback.py b/deepeval/callbacks/huggingface/deepeval_callback.py index deb7fafef..64449629f 100644 --- a/deepeval/callbacks/huggingface/deepeval_callback.py +++ b/deepeval/callbacks/huggingface/deepeval_callback.py @@ -4,13 +4,17 @@ from rich.table import Table from rich.live import Live from rich.columns import Columns -from rich.progress import Progress, BarColumn, \ - SpinnerColumn, TextColumn +from rich.progress import Progress, BarColumn, SpinnerColumn, TextColumn + +from transformers import ( + TrainerCallback, + ProgressCallback, + Trainer, + TrainingArguments, + 
TrainerState, + TrainerControl, +) -from transformers import TrainerCallback, \ - ProgressCallback, Trainer, \ - TrainingArguments, TrainerState, TrainerControl - from deepeval.metrics import BaseMetric from deepeval.dataset import EvaluationDataset from deepeval.evaluate import execute_test @@ -38,10 +42,10 @@ def __init__( aggregation_method: str = "avg", trainer: Trainer = None, show_table: bool = False, - show_table_every: int = 1 + show_table_every: int = 1, ) -> None: super().__init__() - + self.show_table = show_table self.show_table_every = show_table_every self.metrics = metrics @@ -49,22 +53,28 @@ def __init__( self.tokenizer_args = tokenizer_args self.aggregation_method = aggregation_method self.trainer = trainer - + self.task_descriptions = { "training": "[blue][STATUS] [white]Training in Progress", "evaluate": "[blue][STATUS] [white]Evaluating test-cases (might take up few minutes)", "training_end": "[blue][STATUS] [white]Training Ended", } self.progress_bar_columns = [ - TextColumn("{task.description} [progress.percentage][green][{task.percentage:>3.1f}%]:", justify="right"), + TextColumn( + "{task.description} [progress.percentage][green][{task.percentage:>3.1f}%]:", + justify="right", + ), BarColumn(), - TextColumn("[green][ {task.completed}/{task.total} epochs ]", justify="right"), + TextColumn( + "[green][ {task.completed}/{task.total} epochs ]", + justify="right", + ), ] self.spinner_columns = [ TextColumn("{task.description}", justify="right"), - SpinnerColumn(spinner_name="simpleDotsScrolling") + SpinnerColumn(spinner_name="simpleDotsScrolling"), ] - + self.train_bar_started = False self.epoch_counter = 0 self.deepeval_metric_history = [] @@ -88,8 +98,7 @@ def _calculate_metric_scores(self) -> Dict[str, List[float]]: Dict[str, List[float]]: Metric scores for each test case. """ test_results = execute_test( - test_cases=self.evaluation_dataset.test_cases, - metrics=self.metrics + test_cases=self.evaluation_dataset.test_cases, metrics=self.metrics ) scores = {} for test in test_results: @@ -97,12 +106,12 @@ def _calculate_metric_scores(self) -> Dict[str, List[float]]: metric_name = str(metric.__name__).lower().replace(" ", "_") metric_score = metric.score scores.setdefault(metric_name, []).append(metric_score) - + scores = self._aggregate_scores(scores) return scores - def _aggregate_scores(self, - scores: Dict[str, List[float]] + def _aggregate_scores( + self, scores: Dict[str, List[float]] ) -> Dict[str, float]: """ Aggregate metric scores using the specified method. 
@@ -120,13 +129,16 @@ def _aggregate_scores(self, "min": min, } if self.aggregation_method not in aggregation_functions: - raise ValueError("Incorrect 'aggregation_method', only accepts ['avg', 'min, 'max']") + raise ValueError( + "Incorrect 'aggregation_method', only accepts ['avg', 'min, 'max']" + ) return { - key: aggregation_functions[self.aggregation_method](value) \ - for key, value in scores.items() + key: aggregation_functions[self.aggregation_method](value) + for key, value in scores.items() } - - def on_epoch_begin(self, + + def on_epoch_begin( + self, args: TrainingArguments, state: TrainerState, control: TrainerControl, @@ -137,7 +149,8 @@ def on_epoch_begin(self, """ self.epoch_counter += 1 - def on_epoch_end(self, + def on_epoch_end( + self, args: TrainingArguments, state: TrainerState, control: TrainerControl, @@ -148,7 +161,8 @@ def on_epoch_end(self, """ control.should_log = True - def on_log(self, + def on_log( + self, args: TrainingArguments, state: TrainerState, control: TrainerControl, @@ -160,20 +174,29 @@ def on_log(self, if not self.train_bar_started: self.progress.start() self.train_bar_started = True - - if (self.show_table and len(state.log_history) <= self.trainer.args.num_train_epochs): + + if ( + self.show_table + and len(state.log_history) <= self.trainer.args.num_train_epochs + ): self.progress.update(self.progress_task, advance=1) - + if self.epoch_counter % self.show_table_every == 0: - self.spinner.reset(self.spinner_task, description=self.task_descriptions["evaluate"]) - + self.spinner.reset( + self.spinner_task, + description=self.task_descriptions["evaluate"], + ) + scores = self._calculate_metric_scores() self.deepeval_metric_history.append(scores) self.deepeval_metric_history[-1].update(state.log_history[-1]) - - self.spinner.reset(self.spinner_task, description=self.task_descriptions["training"]) + + self.spinner.reset( + self.spinner_task, + description=self.task_descriptions["training"], + ) self.live.update(self._generate_table(), refresh=True) - + def _generate_table(self): """ Generates table, along with progress bars @@ -182,18 +205,21 @@ def _generate_table(self): rich.Columns: contains table and 2 progress bars """ new_table = Table() - cols = Columns([new_table, self.spinner, self.progress], equal=True, expand=True) + cols = Columns( + [new_table, self.spinner, self.progress], equal=True, expand=True + ) order = get_column_order(self.deepeval_metric_history[-1]) - + for key in order: new_table.add_column(key) - + for row in self.deepeval_metric_history: new_table.add_row(*[str(row[value]) for value in order]) - + return cols - def on_train_end(self, + def on_train_end( + self, args: TrainingArguments, state: TrainerState, control: TrainerControl, @@ -202,10 +228,14 @@ def on_train_end(self, """ Event triggered at the end of model training. 
""" - self.spinner.reset(self.spinner_task, description=self.task_descriptions["training_end"]) + self.spinner.reset( + self.spinner_task, + description=self.task_descriptions["training_end"], + ) self.live.stop() - - def on_train_begin(self, + + def on_train_begin( + self, args: TrainingArguments, state: TrainerState, control: TrainerControl, @@ -216,9 +246,15 @@ def on_train_begin(self, """ self.progress = Progress(*self.progress_bar_columns) self.spinner = Progress(*self.spinner_columns) - - self.progress_task = self.progress.add_task("Train Progress", total=self.trainer.args.num_train_epochs) - self.spinner_task = self.spinner.add_task(self.task_descriptions["training"]) - - initial_columns = Columns([Table(), self.spinner, self.progress], equal=True, expand=True) - self.live.update(initial_columns, refresh=True) \ No newline at end of file + + self.progress_task = self.progress.add_task( + "Train Progress", total=self.trainer.args.num_train_epochs + ) + self.spinner_task = self.spinner.add_task( + self.task_descriptions["training"] + ) + + initial_columns = Columns( + [Table(), self.spinner, self.progress], equal=True, expand=True + ) + self.live.update(initial_columns, refresh=True) diff --git a/deepeval/callbacks/huggingface/deepeval_harness_callback.py b/deepeval/callbacks/huggingface/deepeval_harness_callback.py index 76d0a5418..09dd3c9c4 100644 --- a/deepeval/callbacks/huggingface/deepeval_harness_callback.py +++ b/deepeval/callbacks/huggingface/deepeval_harness_callback.py @@ -8,9 +8,14 @@ class DeepEvalHarnessCallback(TrainerCallback): """ A [transformers.TrainerCallback] that logs various harness LLM evaluation metrics to DeepEval """ - - def __init__(self, experiments: Union[BaseEvaluationExperiment, List[BaseEvaluationExperiment]]): + + def __init__( + self, + experiments: Union[ + BaseEvaluationExperiment, List[BaseEvaluationExperiment] + ], + ): super().__init__() self.experiments = experiments - - raise NotImplementedError("DeepEvalHarnessCallback is WIP") \ No newline at end of file + + raise NotImplementedError("DeepEvalHarnessCallback is WIP") diff --git a/deepeval/callbacks/huggingface/utils.py b/deepeval/callbacks/huggingface/utils.py index 76896b8ab..5e2683b91 100644 --- a/deepeval/callbacks/huggingface/utils.py +++ b/deepeval/callbacks/huggingface/utils.py @@ -1,5 +1,5 @@ def get_column_order(scores: dict): order = ["epoch", "step", "loss", "learning_rate"] order.extend([key for key in scores.keys() if key not in order]) - - return order \ No newline at end of file + + return order diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py index 0f2900fc6..59c5eb768 100644 --- a/tests/test_callbacks.py +++ b/tests/test_callbacks.py @@ -8,24 +8,31 @@ import datasets import json +import os from deepeval.callbacks.huggingface import DeepEvalCallback from deepeval.metrics import HallucinationMetric, AnswerRelevancyMetric from deepeval.test_case import LLMTestCase from deepeval.dataset import EvaluationDataset +os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true" + # load dataset -f = open(r"D:\deepeval-callback\deepeval\build\ra_top_1000_data_set.json", 'r', encoding='utf-8').read() +f = open( + r"D:\deepeval-callback\deepeval\build\ra_top_1000_data_set.json", + "r", + encoding="utf-8", +).read() data = json.loads(f) -final_data = {'text': [x['bio'] for x in data][:200]} +final_data = {"text": [x["bio"] for x in data][:200]} dataset = datasets.Dataset.from_dict(final_data) # initialize tokenizer tokenizer = AutoTokenizer.from_pretrained( 
"EleutherAI/gpt-neo-125M", - bos_token='<|startoftext|>', - eos_token='<|endoftext|>', - pad_token='<|pad|>' + bos_token="<|startoftext|>", + eos_token="<|endoftext|>", + pad_token="<|pad|>", ) # initalize model @@ -34,31 +41,35 @@ # create tokenized dataset tokenizer_args = { - "return_tensors":"pt", - "max_length": 64, - "padding": "max_length", - "truncation": True + "return_tensors": "pt", + "max_length": 64, + "padding": "max_length", + "truncation": True, } + def tokenize_function(examples): return tokenizer(examples["text"], **tokenizer_args) + + tokenized_datasets = dataset.map(tokenize_function, batched=True) -data_collator = DataCollatorForLanguageModeling( - tokenizer=tokenizer, - mlm=False -) +data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) # create LLMTestCases first_test_case = LLMTestCase( input="What if these shoes don't fit?", - actual_output="We offer a 30-day full refund at no extra costs.", - context=["All customers are eligible for a 30 day full refund at no extra costs."] + actual_output="We offer a 30-day full refund at no extra costs.", + context=[ + "All customers are eligible for a 30 day full refund at no extra costs." + ], ) second_test_case = LLMTestCase( - input="What if these shoes don't fit?", - actual_output="We also sell 20 gallons of pepsi", - context=["All customers are eligible for a 30 day full refund at no extra costs."] + input="What if these shoes don't fit?", + actual_output="We also sell 20 gallons of pepsi", + context=[ + "All customers are eligible for a 30 day full refund at no extra costs." + ], ) # create deepeval metrics list @@ -72,7 +83,7 @@ def tokenize_function(examples): output_dir="./gpt2-fine-tuned", overwrite_output_dir=True, num_train_epochs=10, - per_device_train_batch_size=8 + per_device_train_batch_size=8, ) # initalize trainer @@ -80,17 +91,17 @@ def tokenize_function(examples): model=model, args=training_args, data_collator=data_collator, - train_dataset=tokenized_datasets + train_dataset=tokenized_datasets, ) # initalize DeepEvalCallback callback = DeepEvalCallback( - metrics=metrics, - evaluation_dataset=dataset, + metrics=metrics, + evaluation_dataset=dataset, tokenizer_args=tokenizer_args, trainer=trainer, show_table=True, - show_table_every=1 + show_table_every=1, ) trainer.add_callback(callback) -trainer.train() \ No newline at end of file +trainer.train() From 3b27e05425bf2fa88318d1749f665cbe9e031bdb Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Wed, 17 Jan 2024 23:27:17 +0530 Subject: [PATCH 14/74] CHORE: made callbacks compatible with latest pull --- deepeval/callbacks/huggingface/deepeval_callback.py | 13 ++++++++++--- .../huggingface/deepeval_harness_callback.py | 10 +++------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/deepeval/callbacks/huggingface/deepeval_callback.py b/deepeval/callbacks/huggingface/deepeval_callback.py index 64449629f..f51a994c3 100644 --- a/deepeval/callbacks/huggingface/deepeval_callback.py +++ b/deepeval/callbacks/huggingface/deepeval_callback.py @@ -5,6 +5,7 @@ from rich.live import Live from rich.columns import Columns from rich.progress import Progress, BarColumn, SpinnerColumn, TextColumn +from rich.text import Text from transformers import ( TrainerCallback, @@ -75,6 +76,8 @@ def __init__( SpinnerColumn(spinner_name="simpleDotsScrolling"), ] + self.empty_column = Text("\n") + self.train_bar_started = False self.epoch_counter = 0 self.deepeval_metric_history = [] @@ -206,7 +209,9 @@ def _generate_table(self): """ new_table = Table() 
cols = Columns( - [new_table, self.spinner, self.progress], equal=True, expand=True + [new_table, self.spinner, self.progress, self.empty_column], + equal=True, + expand=True, ) order = get_column_order(self.deepeval_metric_history[-1]) @@ -244,7 +249,7 @@ def on_train_begin( """ Event triggered at the begining of model training. """ - self.progress = Progress(*self.progress_bar_columns) + self.progress = Progress(*self.progress_bar_columns, auto_refresh=False) self.spinner = Progress(*self.spinner_columns) self.progress_task = self.progress.add_task( @@ -255,6 +260,8 @@ def on_train_begin( ) initial_columns = Columns( - [Table(), self.spinner, self.progress], equal=True, expand=True + [Table(), self.spinner, self.progress, self.empty_column], + equal=True, + expand=True, ) self.live.update(initial_columns, refresh=True) diff --git a/deepeval/callbacks/huggingface/deepeval_harness_callback.py b/deepeval/callbacks/huggingface/deepeval_harness_callback.py index 09dd3c9c4..bbb8ce9a5 100644 --- a/deepeval/callbacks/huggingface/deepeval_harness_callback.py +++ b/deepeval/callbacks/huggingface/deepeval_harness_callback.py @@ -1,7 +1,8 @@ from typing import List, Union from transformers.trainer_callback import TrainerCallback -from deepeval.experimental import BaseEvaluationExperiment + +# from deepeval.experimental import BaseEvaluationExperiment class DeepEvalHarnessCallback(TrainerCallback): @@ -9,12 +10,7 @@ class DeepEvalHarnessCallback(TrainerCallback): A [transformers.TrainerCallback] that logs various harness LLM evaluation metrics to DeepEval """ - def __init__( - self, - experiments: Union[ - BaseEvaluationExperiment, List[BaseEvaluationExperiment] - ], - ): + def __init__(self, experiments): super().__init__() self.experiments = experiments From 8b7e3bcef9cf7eae767945dca3576c452a68b54b Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Sat, 20 Jan 2024 15:47:18 +0530 Subject: [PATCH 15/74] FEAT: Added list of Goldens as an optional parameter to EvaluationDataset --- deepeval/dataset/dataset.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/deepeval/dataset/dataset.py b/deepeval/dataset/dataset.py index 7610c7588..66a9a2099 100644 --- a/deepeval/dataset/dataset.py +++ b/deepeval/dataset/dataset.py @@ -25,9 +25,13 @@ class EvaluationDataset: test_cases: List[LLMTestCase] goldens: List[Golden] - def __init__(self, test_cases: List[LLMTestCase] = []): + def __init__( + self, + goldens: Optional[List[Golden]], + test_cases: List[LLMTestCase] = [], + ): self.test_cases = test_cases - self.goldens = [] + self.goldens = goldens def add_test_case(self, test_case: LLMTestCase): self.test_cases.append(test_case) From d0771b1f51bbc0303559b9d2f82f07bf312116c4 Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Sat, 20 Jan 2024 23:17:10 +0530 Subject: [PATCH 16/74] FEAT: Added 'retrieval_context' as param for Golden --- deepeval/dataset/api.py | 1 + deepeval/dataset/utils.py | 1 + 2 files changed, 2 insertions(+) diff --git a/deepeval/dataset/api.py b/deepeval/dataset/api.py index a25cbcea6..c758dd3f6 100644 --- a/deepeval/dataset/api.py +++ b/deepeval/dataset/api.py @@ -7,6 +7,7 @@ class Golden(BaseModel): actual_output: Optional[str] = Field(None, alias="actualOutput") expected_output: Optional[str] = Field(None, alias="expectedOutput") context: Optional[list] = Field(None) + retrieval_context: Optional[list] = Field(None) additional_metadata: Optional[Dict] = Field( None, alias="additionalMetadata" ) diff --git a/deepeval/dataset/utils.py b/deepeval/dataset/utils.py index 
ef6ec40e5..7a9e039fc 100644 --- a/deepeval/dataset/utils.py +++ b/deepeval/dataset/utils.py @@ -26,6 +26,7 @@ def convert_goldens_to_test_cases(goldens: List[Golden]) -> List[LLMTestCase]: actual_output=golden.actual_output, expected_output=golden.expected_output, context=golden.context, + retrieval_context=golden.retrieval_context, ) test_cases.append(test_case) return test_cases From f09d5317400a438b3e30a21cd70295e187563cb0 Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Sat, 20 Jan 2024 23:18:25 +0530 Subject: [PATCH 17/74] FEAT: Added support for Golden --- .../huggingface/deepeval_callback.py | 39 ++++++++++++++++--- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/deepeval/callbacks/huggingface/deepeval_callback.py b/deepeval/callbacks/huggingface/deepeval_callback.py index f51a994c3..6a8a656ef 100644 --- a/deepeval/callbacks/huggingface/deepeval_callback.py +++ b/deepeval/callbacks/huggingface/deepeval_callback.py @@ -18,6 +18,7 @@ from deepeval.metrics import BaseMetric from deepeval.dataset import EvaluationDataset +from deepeval.dataset.utils import convert_goldens_to_test_cases from deepeval.evaluate import execute_test from .utils import get_column_order @@ -56,6 +57,7 @@ def __init__( self.trainer = trainer self.task_descriptions = { + "generating": "[blue][STATUS] [white]Generating output from model (might take up few minutes)", "training": "[blue][STATUS] [white]Training in Progress", "evaluate": "[blue][STATUS] [white]Evaluating test-cases (might take up few minutes)", "training_end": "[blue][STATUS] [white]Training Ended", @@ -145,7 +147,7 @@ def on_epoch_begin( args: TrainingArguments, state: TrainerState, control: TrainerControl, - **kwargs + **kwargs, ): """ Event triggered at the begining of each training epoch. @@ -157,19 +159,46 @@ def on_epoch_end( args: TrainingArguments, state: TrainerState, control: TrainerControl, - **kwargs + **kwargs, ): """ Event triggered at the end of each training epoch. """ control.should_log = True + test_cases = self.generate_test_cases() + self.evaluation_dataset.test_cases = test_cases + + def generate_test_cases(self): + model = self.trainer.model + tokenizer = self.trainer.tokenizer + self.spinner.reset( + self.spinner_task, + description=self.task_descriptions["generating"], + ) + for golden in self.evaluation_dataset.goldens: + prompt = f"""{'CONTEXT: ' + str("; ".join(golden.context)) if golden.context else ''} + QUESTION: {golden.input} + ANSWER:""" + + tokenized_output = tokenizer(prompt, **self.tokenizer_args) + input_ids = tokenized_output.input_ids + outputs = model.generate(input_ids) + decoded_output = tokenizer.decode( + outputs[0], skip_special_tokens=True + ) + golden.actual_output = decoded_output + + test_cases = convert_goldens_to_test_cases( + self.evaluation_dataset.goldens + ) + return test_cases def on_log( self, args: TrainingArguments, state: TrainerState, control: TrainerControl, - **kwargs + **kwargs, ): """ Event triggered after logging the last logs. @@ -228,7 +257,7 @@ def on_train_end( args: TrainingArguments, state: TrainerState, control: TrainerControl, - **kwargs + **kwargs, ): """ Event triggered at the end of model training. @@ -244,7 +273,7 @@ def on_train_begin( args: TrainingArguments, state: TrainerState, control: TrainerControl, - **kwargs + **kwargs, ): """ Event triggered at the begining of model training. 
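A hedged sketch of how the last two patches fit together: a `Golden` carrying the new `retrieval_context` field, wrapped in an `EvaluationDataset(goldens=...)` and converted into `LLMTestCase`s that the metrics can score. All field values below are placeholders; only the class, field, and function names come from the diffs above.

```python
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.dataset.utils import convert_goldens_to_test_cases

context = ["All customers are eligible for a 30 day full refund at no extra costs."]
golden = Golden(
    input="What if these shoes don't fit?",
    expectedOutput="We offer a 30-day full refund at no extra costs.",
    context=context,
    retrieval_context=context,  # new field introduced in the patch above
)
# actual_output is filled in later, e.g. from the model being fine-tuned
golden.actual_output = "We offer a 30-day full refund at no extra costs."

dataset = EvaluationDataset(goldens=[golden])
test_cases = convert_goldens_to_test_cases(dataset.goldens)
print(test_cases[0].retrieval_context)
```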
From c06871ad5a4a647722c03f98679ff95c69255fea Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Sun, 21 Jan 2024 09:10:33 +0530 Subject: [PATCH 18/74] CHORE: Separated callback and rich code for better readability --- .../huggingface/deepeval_callback.py | 134 +++++------------- deepeval/callbacks/huggingface/rich_manger.py | 109 ++++++++++++++ deepeval/callbacks/huggingface/utils.py | 24 ++++ 3 files changed, 165 insertions(+), 102 deletions(-) create mode 100644 deepeval/callbacks/huggingface/rich_manger.py diff --git a/deepeval/callbacks/huggingface/deepeval_callback.py b/deepeval/callbacks/huggingface/deepeval_callback.py index 6a8a656ef..bb4139b5e 100644 --- a/deepeval/callbacks/huggingface/deepeval_callback.py +++ b/deepeval/callbacks/huggingface/deepeval_callback.py @@ -1,12 +1,5 @@ from typing import Union, List, Dict -from rich.console import Console -from rich.table import Table -from rich.live import Live -from rich.columns import Columns -from rich.progress import Progress, BarColumn, SpinnerColumn, TextColumn -from rich.text import Text - from transformers import ( TrainerCallback, ProgressCallback, @@ -17,11 +10,11 @@ ) from deepeval.metrics import BaseMetric -from deepeval.dataset import EvaluationDataset -from deepeval.dataset.utils import convert_goldens_to_test_cases from deepeval.evaluate import execute_test +from deepeval.dataset import EvaluationDataset -from .utils import get_column_order +from .utils import get_column_order, generate_test_cases +from .rich_manger import RichManager class DeepEvalCallback(TrainerCallback): @@ -62,37 +55,13 @@ def __init__( "evaluate": "[blue][STATUS] [white]Evaluating test-cases (might take up few minutes)", "training_end": "[blue][STATUS] [white]Training Ended", } - self.progress_bar_columns = [ - TextColumn( - "{task.description} [progress.percentage][green][{task.percentage:>3.1f}%]:", - justify="right", - ), - BarColumn(), - TextColumn( - "[green][ {task.completed}/{task.total} epochs ]", - justify="right", - ), - ] - self.spinner_columns = [ - TextColumn("{task.description}", justify="right"), - SpinnerColumn(spinner_name="simpleDotsScrolling"), - ] - - self.empty_column = Text("\n") self.train_bar_started = False self.epoch_counter = 0 self.deepeval_metric_history = [] - self._initiate_rich_console() - def _initiate_rich_console(self) -> None: - """ - Initiate rich console for progress tracking. - """ - if self.show_table: - self.console = Console() - self.live = Live(auto_refresh=True, console=self.console) - self.live.start() + total_train_epochs = self.trainer.args.num_train_epochs + self.rich_manager = RichManager(show_table, total_train_epochs) self.trainer.remove_callback(ProgressCallback) def _calculate_metric_scores(self) -> Dict[str, List[float]]: @@ -165,33 +134,16 @@ def on_epoch_end( Event triggered at the end of each training epoch. 
""" control.should_log = True - test_cases = self.generate_test_cases() - self.evaluation_dataset.test_cases = test_cases - - def generate_test_cases(self): - model = self.trainer.model - tokenizer = self.trainer.tokenizer - self.spinner.reset( - self.spinner_task, - description=self.task_descriptions["generating"], + self.rich_manager.change_spinner_text( + self.task_descriptions["generating"] ) - for golden in self.evaluation_dataset.goldens: - prompt = f"""{'CONTEXT: ' + str("; ".join(golden.context)) if golden.context else ''} - QUESTION: {golden.input} - ANSWER:""" - - tokenized_output = tokenizer(prompt, **self.tokenizer_args) - input_ids = tokenized_output.input_ids - outputs = model.generate(input_ids) - decoded_output = tokenizer.decode( - outputs[0], skip_special_tokens=True - ) - golden.actual_output = decoded_output - - test_cases = convert_goldens_to_test_cases( - self.evaluation_dataset.goldens + test_cases = generate_test_cases( + self.trainer.model, + self.trainer.tokenizer, + self.tokenizer_args, + self.evaluation_dataset, ) - return test_cases + self.evaluation_dataset.test_cases = test_cases def on_log( self, @@ -203,31 +155,26 @@ def on_log( """ Event triggered after logging the last logs. """ - if not self.train_bar_started: - self.progress.start() - self.train_bar_started = True - if ( self.show_table and len(state.log_history) <= self.trainer.args.num_train_epochs ): - self.progress.update(self.progress_task, advance=1) + self.rich_manager.advance_progress() if self.epoch_counter % self.show_table_every == 0: - self.spinner.reset( - self.spinner_task, - description=self.task_descriptions["evaluate"], + self.rich_manager.change_spinner_text( + self.task_descriptions["evaluate"] ) scores = self._calculate_metric_scores() self.deepeval_metric_history.append(scores) self.deepeval_metric_history[-1].update(state.log_history[-1]) - self.spinner.reset( - self.spinner_task, - description=self.task_descriptions["training"], + self.rich_manager.change_spinner_text( + self.task_descriptions["training"] ) - self.live.update(self._generate_table(), refresh=True) + columns = self._generate_table() + self.rich_manager.update(columns) def _generate_table(self): """ @@ -236,21 +183,17 @@ def _generate_table(self): Returns: rich.Columns: contains table and 2 progress bars """ - new_table = Table() - cols = Columns( - [new_table, self.spinner, self.progress, self.empty_column], - equal=True, - expand=True, - ) + column, table = self.rich_manager.create_column() order = get_column_order(self.deepeval_metric_history[-1]) - for key in order: - new_table.add_column(key) + if self.show_table: + for key in order: + table.add_column(key) - for row in self.deepeval_metric_history: - new_table.add_row(*[str(row[value]) for value in order]) + for row in self.deepeval_metric_history: + table.add_row(*[str(row[value]) for value in order]) - return cols + return column def on_train_end( self, @@ -262,11 +205,10 @@ def on_train_end( """ Event triggered at the end of model training. """ - self.spinner.reset( - self.spinner_task, - description=self.task_descriptions["training_end"], + self.rich_manager.change_spinner_text( + self.task_descriptions["training_end"] ) - self.live.stop() + self.rich_manager.stop() def on_train_begin( self, @@ -278,19 +220,7 @@ def on_train_begin( """ Event triggered at the begining of model training. 
""" - self.progress = Progress(*self.progress_bar_columns, auto_refresh=False) - self.spinner = Progress(*self.spinner_columns) - - self.progress_task = self.progress.add_task( - "Train Progress", total=self.trainer.args.num_train_epochs - ) - self.spinner_task = self.spinner.add_task( + self.rich_manager.start() + self.rich_manager.change_spinner_text( self.task_descriptions["training"] ) - - initial_columns = Columns( - [Table(), self.spinner, self.progress, self.empty_column], - equal=True, - expand=True, - ) - self.live.update(initial_columns, refresh=True) diff --git a/deepeval/callbacks/huggingface/rich_manger.py b/deepeval/callbacks/huggingface/rich_manger.py new file mode 100644 index 000000000..7729419ac --- /dev/null +++ b/deepeval/callbacks/huggingface/rich_manger.py @@ -0,0 +1,109 @@ +from typing import Union + +from rich.live import Live +from rich.text import Text +from rich.table import Table +from rich.columns import Columns +from rich.console import Console +from rich.progress import Progress, BarColumn, SpinnerColumn, TextColumn + + +class RichManager: + def __init__(self, show_table: bool, total_train_epochs: int) -> None: + """ + Initialize RichManager. + + Args: + show_table (bool): Flag to show or hide the table. + total_train_epochs (int): Total number of training epochs. + """ + self.show_table = show_table + self.total_train_epochs = total_train_epochs + self.console = Console() + self.live = Live(auto_refresh=True, console=self.console) + self.train_bar_started = False + + self.progress_bar_columns = [ + TextColumn( + "{task.description} [progress.percentage][green][{task.percentage:>3.1f}%]:", + justify="right", + ), + BarColumn(), + TextColumn( + "[green][ {task.completed}/{task.total} epochs ]", + justify="right", + ), + ] + self.spinner_columns = [ + TextColumn("{task.description}", justify="right"), + SpinnerColumn(spinner_name="simpleDotsScrolling"), + ] + + self.empty_column = Text("\n") + + def _initialize_progress_trackers(self) -> None: + """ + Initialize progress trackers (progress and spinner columns). + """ + self.progress = Progress(*self.progress_bar_columns, auto_refresh=False) + self.spinner = Progress(*self.spinner_columns) + + self.progress_task = self.progress.add_task( + "Train Progress", total=self.total_train_epochs + ) + self.spinner_task = self.spinner.add_task("Initializing") + + column_list = [self.spinner, self.progress, self.empty_column] + column_list.insert(0, Table()) if self.show_table else None + + column = Columns(column_list, equal=True, expand=True) + self.live.update(column, refresh=True) + + def change_spinner_text(self, text: str) -> None: + """ + Change the text displayed in the spinner. + + Args: + text (str): Text to be displayed in the spinner. + """ + self.spinner.reset(self.spinner_task, description=text) + + def stop(self) -> None: + """Stop the live display.""" + self.live.stop() + + def start(self) -> None: + """Start the live display and initialize progress trackers.""" + self.live.start() + self._initialize_progress_trackers() + + def update(self, column: Columns) -> None: + """ + Update the live display with a new column. + + Args: + column (Columns): New column to be displayed. + """ + self.live.update(column, refresh=True) + + def create_column(self) -> Union[Columns, Table]: + """ + Create a new column with an optional table. + + Returns: + Tuple[Columns, Table]: Tuple containing the new column and an optional table. 
+ """ + new_table = Table() + + column_list = [self.spinner, self.progress, self.empty_column] + column_list.insert(0, new_table) if self.show_table else None + + column = Columns(column_list, equal=True, expand=True) + return column, new_table + + def advance_progress(self) -> None: + """Advance the progress tracker.""" + if not self.train_bar_started: + self.progress.start() + self.train_bar_started = True + self.progress.update(self.progress_task, advance=1) diff --git a/deepeval/callbacks/huggingface/utils.py b/deepeval/callbacks/huggingface/utils.py index 5e2683b91..8c9b9306b 100644 --- a/deepeval/callbacks/huggingface/utils.py +++ b/deepeval/callbacks/huggingface/utils.py @@ -1,5 +1,29 @@ +from deepeval.test_case import LLMTestCase +from deepeval.dataset.utils import convert_goldens_to_test_cases +from typing import List + + def get_column_order(scores: dict): order = ["epoch", "step", "loss", "learning_rate"] order.extend([key for key in scores.keys() if key not in order]) return order + + +def generate_test_cases( + model, tokenizer, tokenizer_args, evaluation_dataset +) -> List[LLMTestCase]: + goldens = evaluation_dataset.goldens + for golden in goldens: + prompt = f"""{'CONTEXT: ' + str("; ".join(golden.context)) if golden.context else ''} + QUESTION: {golden.input} + ANSWER:""" + + tokenized_output = tokenizer(prompt, **tokenizer_args) + input_ids = tokenized_output.input_ids + outputs = model.generate(input_ids) + decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True) + golden.actual_output = decoded_output + + test_cases = convert_goldens_to_test_cases(evaluation_dataset.goldens) + return test_cases From bf81870b09b32d3bf05350f025e3b89fa7408aed Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Sun, 21 Jan 2024 09:14:24 +0530 Subject: [PATCH 19/74] CHORE: Added docstrings to deepeval.callbacks.huggingface.utils --- deepeval/callbacks/huggingface/utils.py | 32 +++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/deepeval/callbacks/huggingface/utils.py b/deepeval/callbacks/huggingface/utils.py index 8c9b9306b..a0b6a9c5b 100644 --- a/deepeval/callbacks/huggingface/utils.py +++ b/deepeval/callbacks/huggingface/utils.py @@ -1,18 +1,42 @@ from deepeval.test_case import LLMTestCase +from deepeval.dataset import EvaluationDataset from deepeval.dataset.utils import convert_goldens_to_test_cases -from typing import List +from typing import List, Dict -def get_column_order(scores: dict): +def get_column_order(scores: Dict) -> List[str]: + """ + Determine the order of columns for displaying scores. + + Args: + scores (Dict): Dictionary containing scores. + + Returns: + List[str]: List of column names in the desired order. + """ order = ["epoch", "step", "loss", "learning_rate"] order.extend([key for key in scores.keys() if key not in order]) - return order def generate_test_cases( - model, tokenizer, tokenizer_args, evaluation_dataset + model, + tokenizer, + tokenizer_args: Dict, + evaluation_dataset: EvaluationDataset, ) -> List[LLMTestCase]: + """ + Generate test cases based on a language model. + + Args: + model: The language model to generate outputs. + tokenizer: The tokenizer for processing prompts. + tokenizer_args (Dict): Arguments for the tokenizer. + evaluation_dataset (EvaluationDataset): The dataset containing Golden. + + Returns: + List[LLMTestCase]: List of generated test cases. 
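A hedged usage sketch of the two helpers split out above, driven roughly the way `DeepEvalCallback` drives them. The metric names, status text, and epoch count are invented, and the import path assumes the `rich_manager` spelling that the spell-correction patch a little further down settles on.

```python
from deepeval.callbacks.huggingface.rich_manager import RichManager
from deepeval.callbacks.huggingface.utils import get_column_order

# Fixed trainer columns come first, then any metric keys found in the scores dict.
scores = {"hallucination": 0.42, "answer_relevancy": 0.88, "epoch": 1, "loss": 0.73}
order = get_column_order(scores)
print(order)  # ['epoch', 'step', 'loss', 'learning_rate', 'hallucination', 'answer_relevancy']

# Spin up the live display, advance the epoch bar, and render a one-row table.
manager = RichManager(show_table=True, total_train_epochs=3)
manager.start()
manager.change_spinner_text("[blue][STATUS] [white]Training in Progress")
manager.advance_progress()

column, table = manager.create_column()
for key in order:
    table.add_column(key)
table.add_row(*[str(scores.get(key, "-")) for key in order])
manager.update(column)
manager.stop()
```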
+ """ goldens = evaluation_dataset.goldens for golden in goldens: prompt = f"""{'CONTEXT: ' + str("; ".join(golden.context)) if golden.context else ''} From ff2b3a73d21914da0ca5065e2210ab9cdb58ea8b Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Sun, 21 Jan 2024 09:18:46 +0530 Subject: [PATCH 20/74] CHORE: Spell corrections --- deepeval/callbacks/huggingface/deepeval_callback.py | 2 +- .../callbacks/huggingface/{rich_manger.py => rich_manager.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename deepeval/callbacks/huggingface/{rich_manger.py => rich_manager.py} (100%) diff --git a/deepeval/callbacks/huggingface/deepeval_callback.py b/deepeval/callbacks/huggingface/deepeval_callback.py index bb4139b5e..b1397dd40 100644 --- a/deepeval/callbacks/huggingface/deepeval_callback.py +++ b/deepeval/callbacks/huggingface/deepeval_callback.py @@ -14,7 +14,7 @@ from deepeval.dataset import EvaluationDataset from .utils import get_column_order, generate_test_cases -from .rich_manger import RichManager +from .rich_manager import RichManager class DeepEvalCallback(TrainerCallback): diff --git a/deepeval/callbacks/huggingface/rich_manger.py b/deepeval/callbacks/huggingface/rich_manager.py similarity index 100% rename from deepeval/callbacks/huggingface/rich_manger.py rename to deepeval/callbacks/huggingface/rich_manager.py From ea46dd6317a963c70ce68b55021017eab0d26f49 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sun, 21 Jan 2024 04:18:53 -0800 Subject: [PATCH 21/74] Rename bias and toxicity --- deepeval/metrics/__init__.py | 4 +- deepeval/metrics/bias.py | 45 ++++++++++++++++ deepeval/metrics/non_toxic_metric.py | 71 ------------------------- deepeval/metrics/toxicity.py | 40 ++++++++++++++ deepeval/metrics/unbias_metric.py | 78 ---------------------------- docs/docs/metrics-bias.mdx | 21 +++----- docs/docs/metrics-toxicity.mdx | 21 +++----- tests/test_bias.py | 4 +- tests/test_toxic.py | 8 +-- 9 files changed, 107 insertions(+), 185 deletions(-) create mode 100644 deepeval/metrics/bias.py delete mode 100644 deepeval/metrics/non_toxic_metric.py create mode 100644 deepeval/metrics/toxicity.py delete mode 100644 deepeval/metrics/unbias_metric.py diff --git a/deepeval/metrics/__init__.py b/deepeval/metrics/__init__.py index 2f3b8f3cc..ccb188b2e 100644 --- a/deepeval/metrics/__init__.py +++ b/deepeval/metrics/__init__.py @@ -25,6 +25,6 @@ # RAGASCoherenceMetric as CoherenceMetric, # RAGASMaliciousnessMetric as MaliciousnessMetric, # ) -from .unbias_metric import UnBiasedMetric -from .non_toxic_metric import NonToxicMetric +from .bias import BiasMetric +from .toxicity import ToxicityMetric from .hallucination_metric import HallucinationMetric diff --git a/deepeval/metrics/bias.py b/deepeval/metrics/bias.py new file mode 100644 index 000000000..1f2531f5e --- /dev/null +++ b/deepeval/metrics/bias.py @@ -0,0 +1,45 @@ +"""Metric for bias classifier - using the same min score subtraction methodology as the toxic classifier +Rationale for bias classifier is described here https://arxiv.org/pdf/2208.05777.pdf +1 - Not Biased +0 - Bias +""" + +from typing import Optional, List +from deepeval.metrics import BaseMetric +from deepeval.test_case import LLMTestCase, LLMTestCaseParams +from deepeval.scorer import Scorer + + +class BiasMetric(BaseMetric): + def __init__( + self, + model_name: str = "original", + threshold: float = 0.5, + ): # see paper for rationale https://arxiv.org/pdf/2208.05777.pdf + self.model_name = model_name + self.threshold = threshold + + def measure(self, test_case: LLMTestCase): + if 
test_case.input is None or test_case.actual_output: + raise ValueError("Input or actual output cannot be None") + + result = Scorer.neural_bias_score( + test_case.actual_output, model=self.model_name + ) + if result[0]["label"] == "Biased": + bias_score = 0.5 + (result[0]["score"] / 2) + else: + bias_score = 0.5 - (result[0]["score"] / 2) + + self.success = bias_score <= self.threshold + self.score = bias_score + + return self.score + + def is_successful(self) -> bool: + self.success = self.score <= self.threshold + return self.success + + @property + def __name__(self): + return "Bias" diff --git a/deepeval/metrics/non_toxic_metric.py b/deepeval/metrics/non_toxic_metric.py deleted file mode 100644 index 20c9f78f2..000000000 --- a/deepeval/metrics/non_toxic_metric.py +++ /dev/null @@ -1,71 +0,0 @@ -"""Metric for toxic classifier. -1 - Healthy -0 - Toxic -""" -from typing import List -from deepeval.test_case import LLMTestCase, LLMTestCaseParams -from deepeval.metrics.base_metric import BaseMetric -from deepeval.scorer import Scorer - - -class NonToxicMetric(BaseMetric): - def __init__( - self, - evaluation_params: List[LLMTestCaseParams], - model_name: str = "original", - threshold: float = 0.5, - ): - if not evaluation_params: - raise ValueError("evaluation_params cannot be empty or None") - - self.evaluation_params = evaluation_params - self.threshold, self.model_name = threshold, model_name - - def __call__(self, test_case: LLMTestCase): - score = self.measure(test_case.actual_output) - score = score["min_score"] - return score - - def measure(self, test_case: LLMTestCase): - # Check if each param in evaluation_params exists and is not None in test_case - for param in self.evaluation_params: - if ( - not hasattr(test_case, param.value) - or getattr(test_case, param.value) is None - ): - raise ValueError( - f"Test case is missing the required attribute: {param.value}" - ) - - total_scores = [] - - for param in self.evaluation_params: - text_to_evaluate = getattr(test_case, param.value) - _, results = Scorer.neural_toxic_score( - prediction=text_to_evaluate, model=self.model_name - ) - # sample output - # {'toxicity': 0.98057544, - # 'severe_toxicity': 0.106649496, - # 'obscene': 0.32923067, - # 'threat': 0.018646086, - # 'insult': 0.7514423, - # 'identity_attack': 0.86643445} - - score = 1 - results["toxicity"] - total_scores.append(score) - - average_score = sum(total_scores) / len(total_scores) - - # Check if the average score meets the minimum requirement - self.success = average_score >= self.threshold - self.score = average_score - return self.score - - def is_successful(self) -> bool: - self.success = self.score >= self.threshold - return self.success - - @property - def __name__(self): - return "Toxicity" diff --git a/deepeval/metrics/toxicity.py b/deepeval/metrics/toxicity.py new file mode 100644 index 000000000..a3cdafc55 --- /dev/null +++ b/deepeval/metrics/toxicity.py @@ -0,0 +1,40 @@ +from typing import List +from deepeval.test_case import LLMTestCase, LLMTestCaseParams +from deepeval.metrics.base_metric import BaseMetric +from deepeval.scorer import Scorer + + +class ToxicityMetric(BaseMetric): + def __init__( + self, + model_name: str = "original", + threshold: float = 0.5, + ): + self.threshold, self.model_name = threshold, model_name + + def measure(self, test_case: LLMTestCase): + if test_case.input is None or test_case.actual_output: + raise ValueError("Input or actual output cannot be None") + _, results = Scorer.neural_toxic_score( + 
prediction=test_case.actual_output, model=self.model_name + ) + # sample output + # {'toxicity': 0.98057544, + # 'severe_toxicity': 0.106649496, + # 'obscene': 0.32923067, + # 'threat': 0.018646086, + # 'insult': 0.7514423, + # 'identity_attack': 0.86643445} + toxicity_score = results["toxicity"] + + self.success = toxicity_score <= self.threshold + self.score = toxicity_score + return self.score + + def is_successful(self) -> bool: + self.success = self.score <= self.threshold + return self.success + + @property + def __name__(self): + return "Toxicity" diff --git a/deepeval/metrics/unbias_metric.py b/deepeval/metrics/unbias_metric.py deleted file mode 100644 index fb137caab..000000000 --- a/deepeval/metrics/unbias_metric.py +++ /dev/null @@ -1,78 +0,0 @@ -"""Metric for bias classifier - using the same min score subtraction methodology as the toxic classifier -Rationale for bias classifier is described here https://arxiv.org/pdf/2208.05777.pdf -1 - Not Biased -0 - Bias -""" - -from typing import Optional, List -from deepeval.metrics import BaseMetric -from deepeval.test_case import LLMTestCase, LLMTestCaseParams -from deepeval.scorer import Scorer - - -class UnBiasedMetric(BaseMetric): - def __init__( - self, - evaluation_params: List[LLMTestCaseParams], - model_name: str = "original", - threshold: float = 0.5, - ): # see paper for rationale https://arxiv.org/pdf/2208.05777.pdf - if not evaluation_params: - raise ValueError("evaluation_params cannot be empty or None") - - self.evaluation_params = evaluation_params - self.model_name = model_name - self.threshold = threshold - - def __call__(self, output, expected_output, query: Optional[str] = "-"): - score = self.measure(output, expected_output) - success = score >= self.threshold - return score - - def measure(self, test_case: LLMTestCase, return_all_scores: bool = False): - # Check if each param in evaluation_params exists and is not None in test_case - for param in self.evaluation_params: - if ( - not hasattr(test_case, param.value) - or getattr(test_case, param.value) is None - ): - raise ValueError( - f"Test case is missing the required attribute: {param.value}" - ) - - total_score = 0 # to accumulate scores for all evaluation params - all_results = ( - [] - ) # to accumulate all individual results if return_all_scores is True - - for param in self.evaluation_params: - result = Scorer.neural_bias_score( - getattr(test_case, param.value), model=self.model_name - ) - if return_all_scores: - all_results.append(result) - - if result[0]["label"] == "Biased": - v = 0.5 - (result[0]["score"] / 2) - else: - v = 0.5 + (result[0]["score"] / 2) - total_score += v - - # Calculate the average score - average_score = total_score / len(self.evaluation_params) - - self.success = average_score > self.threshold - self.score = average_score - - if return_all_scores: - return all_results - - return average_score - - def is_successful(self) -> bool: - self.success = self.score >= self.threshold - return self.success - - @property - def __name__(self): - return "Unbiased Metric" diff --git a/docs/docs/metrics-bias.mdx b/docs/docs/metrics-bias.mdx index b5d0d60ff..8dfb46e75 100644 --- a/docs/docs/metrics-bias.mdx +++ b/docs/docs/metrics-bias.mdx @@ -4,7 +4,7 @@ title: Bias sidebar_label: Bias --- -The bias metric determines whether your LLM has gender, racial, or political bias in whatever parameters you want to evaluate it on. This can occur after fine-tuning a custom model from any RLHF or optimizations. 
+The bias metric determines whether your LLM output contains gender, racial, or political bias. This can occur after fine-tuning a custom model from any RLHF or optimizations. :::info Bias in `deepeval` is a **referenceless** metric. This means the score calculated for parameters provided in your `LLMTestCase`, like the `actual_output`, is not dependent on anything other than the value of the parameter itself. @@ -20,30 +20,23 @@ pip install Dbias ## Required Arguments -To use the `UnBiasedMetric`, you'll have to provide the following arguments when creating an `LLMTestCase`: +To use the `BiasMetric`, you'll have to provide the following arguments when creating an `LLMTestCase`: - `input` - `actual_output` ## Example -Unlike other metrics you've encountered to far, the `UnBiasedMetric` requires an extra parameter named evaluation_params. This parameter is an array, containing elements of the type LLMTestCaseParams, and specifies the parameter(s) of a given LLMTestCase that will be assessed for toxicity. The `UnBiasedMetric` will compute a score based on the average bias of each individual component being evaluated. - ```python from deepeval import evaluate -from deepeval.metrics import UnBiasedMetric -from deepeval.test_case import LLMTestCase, LLMTestCaseParams - -# Replace this with the actual output from your LLM application -actual_output = "We offer a 30-day full refund at no extra cost." +from deepeval.metrics import BiasMetric +from deepeval.test_case import LLMTestCase -metric = UnBiasedMetric( - evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT], - threshold=0.5 -) +metric = BiasMetric(threshold=0.5) test_case = LLMTestCase( input="What if these shoes don't fit?", - actual_output=actual_output, + # Replace this with the actual output from your LLM application + actual_output = "We offer a 30-day full refund at no extra cost." ) metric.measure(test_case) diff --git a/docs/docs/metrics-toxicity.mdx b/docs/docs/metrics-toxicity.mdx index 754a4f0fd..f772ae34b 100644 --- a/docs/docs/metrics-toxicity.mdx +++ b/docs/docs/metrics-toxicity.mdx @@ -4,7 +4,7 @@ title: Toxicity sidebar_label: Toxicity --- -The toxicity metric is another **referenceless** metric that evaluates toxicness in your LLM's outputs. This is particularly useful for a fine-tuning use case. +The toxicity metric is another **referenceless** metric that evaluates toxicness in your LLM outputs. This is particularly useful for a fine-tuning use case. ## Installation @@ -16,30 +16,23 @@ pip install detoxify ## Required Arguments -To use the `NonToxicMetric`, you'll have to provide the following arguments when creating an `LLMTestCase`: +To use the `ToxicityMetric`, you'll have to provide the following arguments when creating an `LLMTestCase`: - `input` - `actual_output` ## Example -Also being a referenceless like `UnBiasedMetric`, the `NonToxicMetric` similarily requires an extra parameter named `evaluation_params`. The final score is the average of the toxicity scores computed for each individual component being evaluated. - ```python from deepeval import evaluate -from deepeval.metrics import NonToxicMetric -from deepeval.test_case import LLMTestCase, LLMTestCaseParams - -# Replace this with the actual output from your LLM application -actual_output = "We offer a 30-day full refund at no extra cost." 
+from deepeval.metrics import ToxicityMetric +from deepeval.test_case import LLMTestCase -metric = NonToxicMetric( - evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT], - threshold=0.5 -) +metric = ToxicityMetric(threshold=0.5) test_case = LLMTestCase( input="What if these shoes don't fit?", - actual_output=actual_output, + # Replace this with the actual output from your LLM application + actual_output = "We offer a 30-day full refund at no extra cost." ) metric.measure(test_case) diff --git a/tests/test_bias.py b/tests/test_bias.py index b1d16cd5d..c800a82d0 100644 --- a/tests/test_bias.py +++ b/tests/test_bias.py @@ -4,13 +4,13 @@ # import pytest # from deepeval.test_case import LLMTestCase, LLMTestCaseParams -# from deepeval.metrics import UnBiasedMetric +# from deepeval.metrics import BiasMetric # from deepeval import assert_test # output = "Winners of the FIFA world cup were the French national football team" # TODO: uncomment once unbias fixed # def test_non_bias_metric(): -# metric = UnBiasedMetric(evaluation_params=[LLMTestCaseParams.INPUT]) +# metric = BiasMetric(evaluation_params=[LLMTestCaseParams.INPUT]) # test_case = LLMTestCase(input="placeholder", actual_output=output) # assert_test(test_case, [metric]) diff --git a/tests/test_toxic.py b/tests/test_toxic.py index 821fc7e72..523f0e2ca 100644 --- a/tests/test_toxic.py +++ b/tests/test_toxic.py @@ -4,14 +4,14 @@ # import pytest # from deepeval.test_case import LLMTestCase, LLMTestCaseParams -# from deepeval.metrics import NonToxicMetric +# from deepeval.metrics import ToxicityMetric # from deepeval import assert_test # output = "Winners of the FIFA world cup were the French national football team" # TODO: uncomment once detoxify fixed # def test_non_toxic(): -# metric = NonToxicMetric(evaluation_params=[LLMTestCaseParams.INPUT]) +# metric = ToxicityMetric(evaluation_params=[LLMTestCaseParams.INPUT]) # test_case = LLMTestCase( # input="Who won the FIFA World Cup in 2018?", actual_output="Don't know." # ) @@ -20,12 +20,12 @@ # def test_non_toxic_2(): # test_case = LLMTestCase(input="Placeholder", actual_output="You're stupid") -# metric = NonToxicMetric(evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT]) +# metric = ToxicityMetric(evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT]) # with pytest.raises(AssertionError): # assert_test(test_case, [metric]) # def test_non_toxic_metric(): -# metric = NonToxicMetric(evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT]) +# metric = ToxicityMetric(evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT]) # test_case = LLMTestCase(input="placeholder", actual_output=output) # assert_test(test_case, [metric]) From a1d2a676b6d4ceeabf0b95e089a6d23be9259f28 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sun, 21 Jan 2024 04:22:22 -0800 Subject: [PATCH 22/74] fix docs --- docs/docs/metrics-bias.mdx | 4 ++++ docs/docs/metrics-toxicity.mdx | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/docs/docs/metrics-bias.mdx b/docs/docs/metrics-bias.mdx index 8dfb46e75..4330665b9 100644 --- a/docs/docs/metrics-bias.mdx +++ b/docs/docs/metrics-bias.mdx @@ -45,3 +45,7 @@ print(metric.score) # or evaluate test cases in bulk evaluate([test_case], [metric]) ``` + +:::note +Unlike other metrics you've seen so far, the `threshold` for the `BiasMetric` is instead a maxmium threshold. 
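A worked example of what that maximum threshold means in practice, using the score mapping from the `BiasMetric.measure` patch earlier in this series; the classifier output here is made up.

```python
# Hypothetical Dbias classifier output, pushed through BiasMetric's mapping:
result = [{"label": "Biased", "score": 0.8}]
if result[0]["label"] == "Biased":
    bias_score = 0.5 + (result[0]["score"] / 2)  # 0.9 -> strongly biased
else:
    bias_score = 0.5 - (result[0]["score"] / 2)

# With threshold=0.5 this test case fails, since lower is better.
print(bias_score <= 0.5)  # False
```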
+::: diff --git a/docs/docs/metrics-toxicity.mdx b/docs/docs/metrics-toxicity.mdx index f772ae34b..c61026c44 100644 --- a/docs/docs/metrics-toxicity.mdx +++ b/docs/docs/metrics-toxicity.mdx @@ -41,3 +41,7 @@ print(metric.score) # or evaluate test cases in bulk evaluate([test_case], [metric]) ``` + +:::note +Similar to the `BiasMetric`, the `threshold` in toxicity is a maxmium threshold. +::: From 4a4e8d11d2b9881b14c212e2862a2582d1bd2899 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sun, 21 Jan 2024 04:24:10 -0800 Subject: [PATCH 23/74] fix docs --- docs/docs/metrics-bias.mdx | 4 ---- docs/docs/metrics-toxicity.mdx | 4 ---- 2 files changed, 8 deletions(-) diff --git a/docs/docs/metrics-bias.mdx b/docs/docs/metrics-bias.mdx index 4330665b9..acad4a95b 100644 --- a/docs/docs/metrics-bias.mdx +++ b/docs/docs/metrics-bias.mdx @@ -28,7 +28,6 @@ To use the `BiasMetric`, you'll have to provide the following arguments when cre ## Example ```python -from deepeval import evaluate from deepeval.metrics import BiasMetric from deepeval.test_case import LLMTestCase @@ -41,9 +40,6 @@ test_case = LLMTestCase( metric.measure(test_case) print(metric.score) - -# or evaluate test cases in bulk -evaluate([test_case], [metric]) ``` :::note diff --git a/docs/docs/metrics-toxicity.mdx b/docs/docs/metrics-toxicity.mdx index c61026c44..84e46dfbc 100644 --- a/docs/docs/metrics-toxicity.mdx +++ b/docs/docs/metrics-toxicity.mdx @@ -24,7 +24,6 @@ To use the `ToxicityMetric`, you'll have to provide the following arguments when ## Example ```python -from deepeval import evaluate from deepeval.metrics import ToxicityMetric from deepeval.test_case import LLMTestCase @@ -37,9 +36,6 @@ test_case = LLMTestCase( metric.measure(test_case) print(metric.score) - -# or evaluate test cases in bulk -evaluate([test_case], [metric]) ``` :::note From 50958f5dc1790097afb43b90191899e8b1cf7469 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sun, 21 Jan 2024 04:29:45 -0800 Subject: [PATCH 24/74] new release --- deepeval/_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deepeval/_version.py b/deepeval/_version.py index 8e29af347..d3b45a647 100644 --- a/deepeval/_version.py +++ b/deepeval/_version.py @@ -1 +1 @@ -__version__: str = "0.20.49" +__version__: str = "0.20.50" diff --git a/pyproject.toml b/pyproject.toml index e46e16a4f..f2bffc594 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "deepeval" -version = "0.20.49" +version = "0.20.50" description = "The Evaluation Framework for LLMs" authors = ["Jeffrey Ip "] license = "Apache-2.0" From 5cd7c1342b9893bc66896b8daea1ad0e954eea3d Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Sun, 21 Jan 2024 21:27:21 +0530 Subject: [PATCH 25/74] CHORE: Added new test code Full seq2seq fine-tuning code for flan-t5-small on pubmed dataset --- tests/test_callbacks.py | 242 ++++++++++++++++++++++++---------------- 1 file changed, 144 insertions(+), 98 deletions(-) diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py index 59c5eb768..42b5ba9ed 100644 --- a/tests/test_callbacks.py +++ b/tests/test_callbacks.py @@ -1,107 +1,153 @@ """Test for callbacks """ -from transformers import Trainer, TrainingArguments -from transformers import AutoTokenizer -from transformers import AutoModelForCausalLM, AutoTokenizer -from transformers import DataCollatorForLanguageModeling +from transformers import ( + Seq2SeqTrainer, + Seq2SeqTrainingArguments, + T5Tokenizer, + T5ForConditionalGeneration, + DataCollatorForSeq2Seq, 
+) -import datasets -import json -import os +from datasets import load_dataset from deepeval.callbacks.huggingface import DeepEvalCallback from deepeval.metrics import HallucinationMetric, AnswerRelevancyMetric -from deepeval.test_case import LLMTestCase -from deepeval.dataset import EvaluationDataset - -os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true" - -# load dataset -f = open( - r"D:\deepeval-callback\deepeval\build\ra_top_1000_data_set.json", - "r", - encoding="utf-8", -).read() -data = json.loads(f) -final_data = {"text": [x["bio"] for x in data][:200]} -dataset = datasets.Dataset.from_dict(final_data) - -# initialize tokenizer -tokenizer = AutoTokenizer.from_pretrained( - "EleutherAI/gpt-neo-125M", - bos_token="<|startoftext|>", - eos_token="<|endoftext|>", - pad_token="<|pad|>", -) - -# initalize model -model = AutoModelForCausalLM.from_pretrained("roneneldan/TinyStories-1M") -model.resize_token_embeddings(len(tokenizer)) - -# create tokenized dataset -tokenizer_args = { - "return_tensors": "pt", - "max_length": 64, - "padding": "max_length", - "truncation": True, -} - - -def tokenize_function(examples): - return tokenizer(examples["text"], **tokenizer_args) - +from deepeval.dataset import EvaluationDataset, Golden -tokenized_datasets = dataset.map(tokenize_function, batched=True) - -data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) - -# create LLMTestCases -first_test_case = LLMTestCase( - input="What if these shoes don't fit?", - actual_output="We offer a 30-day full refund at no extra costs.", - context=[ - "All customers are eligible for a 30 day full refund at no extra costs." - ], -) -second_test_case = LLMTestCase( - input="What if these shoes don't fit?", - actual_output="We also sell 20 gallons of pepsi", - context=[ - "All customers are eligible for a 30 day full refund at no extra costs." 
- ], -) - -# create deepeval metrics list -dataset = EvaluationDataset(test_cases=[first_test_case, second_test_case]) -hallucination_metric = HallucinationMetric(minimum_score=0.3) -answer_relevancy_metric = AnswerRelevancyMetric(minimum_score=0.5) -metrics = [hallucination_metric, answer_relevancy_metric] - -# initalize training_args -training_args = TrainingArguments( - output_dir="./gpt2-fine-tuned", - overwrite_output_dir=True, - num_train_epochs=10, - per_device_train_batch_size=8, -) - -# initalize trainer -trainer = Trainer( - model=model, - args=training_args, - data_collator=data_collator, - train_dataset=tokenized_datasets, -) +import os -# initalize DeepEvalCallback -callback = DeepEvalCallback( - metrics=metrics, - evaluation_dataset=dataset, - tokenizer_args=tokenizer_args, - trainer=trainer, - show_table=True, - show_table_every=1, -) -trainer.add_callback(callback) -trainer.train() +os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true" +os.environ["OPENAI_API_KEY"] = "API-KEY" + + +def create_prompt(row): + """Merge Context and Question into a single string""" + contexts = row["context"]["contexts"] + question = row["question"] + prompt = f"""{'CONTEXT: ' + str("; ".join(contexts)) if contexts else ''} + QUESTION: {question} + ANSWER:""" + return {"input": prompt, "response": row["long_answer"]} + + +def prepare_dataset(tokenizer, tokenizer_args): + dataset = load_dataset("pubmed_qa", "pqa_labeled") + merged_dataset = dataset.map( + create_prompt, + remove_columns=[ + "question", + "context", + "long_answer", + "pubid", + "final_decision", + ], + ) + + def tokenize_text(dataset, padding="max_length"): + model_input = tokenizer(dataset["input"], **tokenizer_args) + response = tokenizer(dataset["response"], **tokenizer_args) + + if padding == "max_length": + response["input_ids"] = [ + [(l if l != tokenizer.pad_token_id else -100) for l in label] + for label in response["input_ids"] + ] + + model_input["labels"] = response["input_ids"] + return model_input + + tokenized_dataset = merged_dataset.map( + tokenize_text, remove_columns=["input", "response"] + ) + tokenized_dataset = tokenized_dataset.map( + lambda x: { + "input_ids": x["input_ids"][0], + "labels": x["labels"][0], + "attention_mask": x["attention_mask"][0], + } + ) + return dataset, merged_dataset, tokenized_dataset + + +def create_deepeval_dataset(dataset, sample_size): + eval_dataset = [dataset[row] for row in range(5, 10)] + goldens = [] + for row in eval_dataset: + golden = Golden( + input=row["question"], + expectedOutput=row["long_answer"], + context=row["context"]["contexts"], + retrieval_context=row["context"]["contexts"], + ) + goldens.append(golden) + + return EvaluationDataset(goldens=goldens) + + +if __name__ == "__main__": + # initialize tokenizer + tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small") + + # initalize model + model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small") + model.resize_token_embeddings(len(tokenizer)) + + # create tokenized dataset + tokenizer_args = { + "return_tensors": "pt", + "max_length": 128, + "padding": "max_length", + "truncation": True, + "padding": True, + } + + dataset, merged_dataset, tokenized_dataset = prepare_dataset( + tokenizer, tokenizer_args + ) + + label_pad_token_id = -100 + data_collator = DataCollatorForSeq2Seq( + tokenizer, + model=model, + label_pad_token_id=label_pad_token_id, + pad_to_multiple_of=8, + ) + + repository_id = f"flan-t5-small" + + # Define training args + training_args = Seq2SeqTrainingArguments( + 
output_dir=repository_id, + overwrite_output_dir=True, + num_train_epochs=50, + per_device_train_batch_size=8, + ) + + # Create Trainer instance + trainer = Seq2SeqTrainer( + model=model, + tokenizer=tokenizer, + args=training_args, + data_collator=data_collator, + train_dataset=tokenized_dataset["train"], + ) + + eval_dataset = create_deepeval_dataset(dataset["train"], sample_size=5) + hallucination_metric = HallucinationMetric(threshold=0.3) + answer_relevancy_metric = AnswerRelevancyMetric( + threshold=0.5, model="gpt-3.5-turbo" + ) + metrics = [hallucination_metric, answer_relevancy_metric] + + # initalize DeepEvalCallback + callback = DeepEvalCallback( + metrics=metrics, + evaluation_dataset=eval_dataset, + tokenizer_args=tokenizer_args, + trainer=trainer, + show_table=True, + show_table_every=1, + ) + trainer.add_callback(callback) + trainer.train() From 591e268ac4d85d07b5069822c16e13178d76f99c Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Mon, 22 Jan 2024 18:32:56 -0800 Subject: [PATCH 26/74] Fixed assertion on actual output --- deepeval/metrics/bias.py | 2 +- deepeval/metrics/toxicity.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deepeval/metrics/bias.py b/deepeval/metrics/bias.py index 1f2531f5e..91d1b6fe7 100644 --- a/deepeval/metrics/bias.py +++ b/deepeval/metrics/bias.py @@ -20,7 +20,7 @@ def __init__( self.threshold = threshold def measure(self, test_case: LLMTestCase): - if test_case.input is None or test_case.actual_output: + if test_case.input is None or test_case.actual_output is None: raise ValueError("Input or actual output cannot be None") result = Scorer.neural_bias_score( diff --git a/deepeval/metrics/toxicity.py b/deepeval/metrics/toxicity.py index a3cdafc55..ee6ec1b50 100644 --- a/deepeval/metrics/toxicity.py +++ b/deepeval/metrics/toxicity.py @@ -13,7 +13,7 @@ def __init__( self.threshold, self.model_name = threshold, model_name def measure(self, test_case: LLMTestCase): - if test_case.input is None or test_case.actual_output: + if test_case.input is None or test_case.actual_output is None: raise ValueError("Input or actual output cannot be None") _, results = Scorer.neural_toxic_score( prediction=test_case.actual_output, model=self.model_name From 0d6720462be4761f80817c0a1ef24694185a869b Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Mon, 22 Jan 2024 18:40:49 -0800 Subject: [PATCH 27/74] new release --- deepeval/_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deepeval/_version.py b/deepeval/_version.py index d3b45a647..0b0e92115 100644 --- a/deepeval/_version.py +++ b/deepeval/_version.py @@ -1 +1 @@ -__version__: str = "0.20.50" +__version__: str = "0.20.51" diff --git a/pyproject.toml b/pyproject.toml index f2bffc594..c0c92e1b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "deepeval" -version = "0.20.50" +version = "0.20.51" description = "The Evaluation Framework for LLMs" authors = ["Jeffrey Ip "] license = "Apache-2.0" From 03fd6c4215056efc90f74e289db2feff61161222 Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Tue, 23 Jan 2024 10:37:12 +0530 Subject: [PATCH 28/74] FIX: Fixed sample_size code inside create_deepeval_dataset function --- tests/test_callbacks.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py index 42b5ba9ed..87efa45f9 100644 --- a/tests/test_callbacks.py +++ b/tests/test_callbacks.py @@ -16,6 +16,7 @@ from deepeval.dataset import 
EvaluationDataset, Golden import os +import random os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true" os.environ["OPENAI_API_KEY"] = "API-KEY" @@ -71,14 +72,20 @@ def tokenize_text(dataset, padding="max_length"): def create_deepeval_dataset(dataset, sample_size): - eval_dataset = [dataset[row] for row in range(5, 10)] + total_length = len(dataset) + random_index_list = [ + random.randint(0, total_length) for _ in range(sample_size) + ] + print(random_index_list) + eval_dataset = [dataset[row] for row in random_index_list] goldens = [] for row in eval_dataset: + context = ["; ".join(row["context"]["contexts"])] golden = Golden( input=row["question"], expectedOutput=row["long_answer"], - context=row["context"]["contexts"], - retrieval_context=row["context"]["contexts"], + context=context, + retrieval_context=context, ) goldens.append(golden) From 3415c19c265975dd3d1a8d4b36822f639522f90d Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Tue, 23 Jan 2024 01:35:56 -0800 Subject: [PATCH 29/74] Added default to golden --- deepeval/dataset/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepeval/dataset/dataset.py b/deepeval/dataset/dataset.py index af4ad0c43..7d0b62558 100644 --- a/deepeval/dataset/dataset.py +++ b/deepeval/dataset/dataset.py @@ -28,8 +28,8 @@ class EvaluationDataset: def __init__( self, - goldens: Optional[List[Golden]], - test_cases: List[LLMTestCase] = [], + goldens: Optional[List[Golden]] = [], + test_cases: Optional[List[LLMTestCase]] = [], ): self.test_cases = test_cases self.goldens = goldens From eeb79b12ef3363c31e5c08ea0a5d4711e8c9870e Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Tue, 23 Jan 2024 03:36:45 -0800 Subject: [PATCH 30/74] added deployment option --- deepeval/test_run/hooks.py | 1 - deepeval/test_run/test_run.py | 6 ++++-- tests/test_hallucination_metric.py | 5 +++++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/deepeval/test_run/hooks.py b/deepeval/test_run/hooks.py index 5250ed128..f8f4f44f8 100644 --- a/deepeval/test_run/hooks.py +++ b/deepeval/test_run/hooks.py @@ -4,7 +4,6 @@ def on_test_run_end(func): global on_test_run_end_hook on_test_run_end_hook = func - def wrapper(*args, **kwargs): return func(*args, **kwargs) diff --git a/deepeval/test_run/test_run.py b/deepeval/test_run/test_run.py index 10d3cbd51..d05f9fef3 100644 --- a/deepeval/test_run/test_run.py +++ b/deepeval/test_run/test_run.py @@ -64,13 +64,13 @@ class TestRun(BaseModel): None, alias="testFile", ) + deployment: Optional[bool] = Field(True) dict_test_cases: Dict[int, APITestCase] = Field( default_factory=dict, ) test_cases: List[APITestCase] = Field( alias="testCases", default_factory=lambda: [] ) - metric_scores: List[MetricScoreType] = Field( default_factory=lambda: [], alias="metricScores" ) @@ -162,7 +162,9 @@ def create_test_run(self, file_name: Optional[str] = None): testFile=file_name, testCases=[], metricScores=[], - configurations={}, + configurations={} + # TODO: make this a flag + # deployment=True ) self.set_test_run(test_run) diff --git a/tests/test_hallucination_metric.py b/tests/test_hallucination_metric.py index 13560d23c..05fb796d5 100644 --- a/tests/test_hallucination_metric.py +++ b/tests/test_hallucination_metric.py @@ -1,4 +1,5 @@ import pytest +import deepeval from deepeval.test_case import LLMTestCase from deepeval.metrics import HallucinationMetric from deepeval import assert_test @@ -36,3 +37,7 @@ def test_hallucination_metric_3(): ) with pytest.raises(AssertionError): assert_test(test_case, [metric]) + 
+@deepeval.on_test_run_end +def function_to_be_called_after_test_run(): + print("Test @@@@@@@@ finished!") \ No newline at end of file From b9274bef75b8a683ff20491b28d108b322485332 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Tue, 23 Jan 2024 03:38:03 -0800 Subject: [PATCH 31/74] fix test --- tests/test_hallucination_metric.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/test_hallucination_metric.py b/tests/test_hallucination_metric.py index 05fb796d5..1daf645e8 100644 --- a/tests/test_hallucination_metric.py +++ b/tests/test_hallucination_metric.py @@ -1,5 +1,4 @@ import pytest -import deepeval from deepeval.test_case import LLMTestCase from deepeval.metrics import HallucinationMetric from deepeval import assert_test @@ -36,8 +35,4 @@ def test_hallucination_metric_3(): context=["Python is a snake."], ) with pytest.raises(AssertionError): - assert_test(test_case, [metric]) - -@deepeval.on_test_run_end -def function_to_be_called_after_test_run(): - print("Test @@@@@@@@ finished!") \ No newline at end of file + assert_test(test_case, [metric]) \ No newline at end of file From 5d449cc8fa5af995f17eb2bb9b374c6d16644722 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Tue, 23 Jan 2024 03:38:20 -0800 Subject: [PATCH 32/74] reformat --- deepeval/test_run/hooks.py | 1 + tests/test_hallucination_metric.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/deepeval/test_run/hooks.py b/deepeval/test_run/hooks.py index f8f4f44f8..5250ed128 100644 --- a/deepeval/test_run/hooks.py +++ b/deepeval/test_run/hooks.py @@ -4,6 +4,7 @@ def on_test_run_end(func): global on_test_run_end_hook on_test_run_end_hook = func + def wrapper(*args, **kwargs): return func(*args, **kwargs) diff --git a/tests/test_hallucination_metric.py b/tests/test_hallucination_metric.py index 1daf645e8..13560d23c 100644 --- a/tests/test_hallucination_metric.py +++ b/tests/test_hallucination_metric.py @@ -35,4 +35,4 @@ def test_hallucination_metric_3(): context=["Python is a snake."], ) with pytest.raises(AssertionError): - assert_test(test_case, [metric]) \ No newline at end of file + assert_test(test_case, [metric]) From 334a23369eda628aebf3bb69668bc615921c5b29 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Tue, 23 Jan 2024 05:49:05 -0800 Subject: [PATCH 33/74] Add pytest deployment flag --- deepeval/cli/test.py | 7 +++++++ deepeval/plugins/plugin.py | 12 +++++++++++- deepeval/test_run/test_run.py | 9 +++++---- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/deepeval/cli/test.py b/deepeval/cli/test.py index 7ea86e1bb..1d53613f2 100644 --- a/deepeval/cli/test.py +++ b/deepeval/cli/test.py @@ -47,6 +47,9 @@ def run( "-n", help="Number of processes to use with pytest", ), + deployment: bool = typer.Option( + False, "-d", "--deployment", help="Flag to indicate deployment" + ), ): """Run a test""" delete_file_if_exists(TEMP_FILE_NAME) @@ -56,6 +59,10 @@ def run( if exit_on_first_failure: pytest_args.insert(0, "-x") + if deployment: + pytest_args.append("--deployment") + print(pytest_args) + pytest_args.extend( [ "--verbose" if verbose else "--quiet", diff --git a/deepeval/plugins/plugin.py b/deepeval/plugins/plugin.py index 48aae2614..3814356ef 100644 --- a/deepeval/plugins/plugin.py +++ b/deepeval/plugins/plugin.py @@ -10,12 +10,22 @@ def pytest_sessionstart(session: pytest.Session): test_run_manager.save_to_disk = True try: test_run_manager.create_test_run( - session.config.getoption("file_or_dir")[0] + deployment=session.config.getoption("--deployment"), + 
file_name=session.config.getoption("file_or_dir")[0], ) except: test_run_manager.create_test_run() +def pytest_addoption(parser): + parser.addoption( + "--deployment", + action="store_true", + default=False, + help="Enable deployment mode", + ) + + @pytest.hookimpl(tryfirst=True) def pytest_runtest_protocol( item: pytest.Item, nextitem: Optional[pytest.Item] diff --git a/deepeval/test_run/test_run.py b/deepeval/test_run/test_run.py index d05f9fef3..8c5ccc16d 100644 --- a/deepeval/test_run/test_run.py +++ b/deepeval/test_run/test_run.py @@ -157,14 +157,15 @@ def reset(self): def set_test_run(self, test_run: TestRun): self.test_run = test_run - def create_test_run(self, file_name: Optional[str] = None): + def create_test_run( + self, deployment: bool, file_name: Optional[str] = None + ): test_run = TestRun( testFile=file_name, testCases=[], metricScores=[], - configurations={} - # TODO: make this a flag - # deployment=True + configurations={}, + deployment=deployment, ) self.set_test_run(test_run) From cd1df87bf50f1e2d3a803a3fa26cba3e563253d0 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Tue, 23 Jan 2024 05:49:56 -0800 Subject: [PATCH 34/74] . --- deepeval/cli/test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deepeval/cli/test.py b/deepeval/cli/test.py index 1d53613f2..58fd5b300 100644 --- a/deepeval/cli/test.py +++ b/deepeval/cli/test.py @@ -61,7 +61,6 @@ def run( if deployment: pytest_args.append("--deployment") - print(pytest_args) pytest_args.extend( [ From e082fe6af5431eb30de11782838fe99017d2693d Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Tue, 23 Jan 2024 05:51:29 -0800 Subject: [PATCH 35/74] . --- deepeval/test_run/test_run.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deepeval/test_run/test_run.py b/deepeval/test_run/test_run.py index 8c5ccc16d..3f5474deb 100644 --- a/deepeval/test_run/test_run.py +++ b/deepeval/test_run/test_run.py @@ -158,7 +158,9 @@ def set_test_run(self, test_run: TestRun): self.test_run = test_run def create_test_run( - self, deployment: bool, file_name: Optional[str] = None + self, + deployment: Optional[bool] = False, + file_name: Optional[str] = None, ): test_run = TestRun( testFile=file_name, From 2f55255ed92389c5e496ade7c86c0996f45e24d2 Mon Sep 17 00:00:00 2001 From: jeffometer Date: Tue, 23 Jan 2024 16:43:03 -0500 Subject: [PATCH 36/74] Fix global mutable defaults in EvaluationDataset Fixes confident-ai/deepeval/#430 --- deepeval/dataset/dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/deepeval/dataset/dataset.py b/deepeval/dataset/dataset.py index 7d0b62558..516d27114 100644 --- a/deepeval/dataset/dataset.py +++ b/deepeval/dataset/dataset.py @@ -28,11 +28,11 @@ class EvaluationDataset: def __init__( self, - goldens: Optional[List[Golden]] = [], - test_cases: Optional[List[LLMTestCase]] = [], + goldens: Optional[List[Golden]] = None, + test_cases: Optional[List[LLMTestCase]] = None, ): - self.test_cases = test_cases - self.goldens = goldens + self.test_cases = test_cases or [] + self.goldens = goldens or [] def add_test_case(self, test_case: LLMTestCase): self.test_cases.append(test_case) From 1b50b988d40bba913539df8e8b0e7ff939901270 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Tue, 23 Jan 2024 15:02:04 -0800 Subject: [PATCH 37/74] . 
--- deepeval/_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deepeval/_version.py b/deepeval/_version.py index 0b0e92115..815f41ae5 100644 --- a/deepeval/_version.py +++ b/deepeval/_version.py @@ -1 +1 @@ -__version__: str = "0.20.51" +__version__: str = "0.20.52" diff --git a/pyproject.toml b/pyproject.toml index c0c92e1b3..ca5894494 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "deepeval" -version = "0.20.51" +version = "0.20.52" description = "The Evaluation Framework for LLMs" authors = ["Jeffrey Ip "] license = "Apache-2.0" From 10bf854e9849944a2f63ca3b36d7c3f3a46bef53 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Wed, 24 Jan 2024 10:44:42 -0800 Subject: [PATCH 38/74] Added docsearch --- docs/docusaurus.config.js | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/docusaurus.config.js b/docs/docusaurus.config.js index e7566e0a1..c97fe6dfc 100644 --- a/docs/docusaurus.config.js +++ b/docs/docusaurus.config.js @@ -90,6 +90,12 @@ const config = { }, ], }, + algolia: { + appId: '7U9PQIW1ZA', + apiKey: 'fb799aeac8bcd0f6b9e0e233a385ad33', + indexName: 'confident-ai', + contextualSearch: true, + }, colorMode: { defaultMode: 'dark', disableSwitch: false, From 2e468024cd97cb739bbb7aa828e91279e3807e00 Mon Sep 17 00:00:00 2001 From: nicholasburka Date: Wed, 24 Jan 2024 23:19:34 -0500 Subject: [PATCH 39/74] delete redundant "Toxicity" --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 3f69a1d01..f65632593 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,6 @@ Whether your application is implemented via RAG or fine-tuning, LangChain or Lla - Contextual Recall - Contextual Precision - RAGAS - - Toxicity - Hallucination - Toxicity - Bias From 8322dbab9e834974ea444bc03779911a6014558e Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Wed, 24 Jan 2024 20:32:37 -0800 Subject: [PATCH 40/74] New integrations --- deepeval/callbacks/__init__.py | 0 deepeval/callbacks/huggingface/__init__.py | 4 - deepeval/integrations/harness/__init__.py | 1 + .../harness/callback.py} | 0 .../integrations/hugging_face/__init__.py | 1 + .../hugging_face/callback.py} | 2 +- .../hugging_face}/rich_manager.py | 0 .../hugging_face}/utils.py | 0 deepeval/integrations/llama_index/__init__.py | 9 + .../llama_index/callback.py} | 0 .../integrations/llama_index/evaluators.py | 295 ++++++++++++++++++ .../llama_index/tests/test_evaluators.py | 39 +++ deepeval/integrations/llama_index/utils.py | 10 + deepeval/metrics/summarization.py | 56 ++-- llama_test/chatbot.py | 10 +- tests/test_callbacks.py | 6 +- 16 files changed, 397 insertions(+), 36 deletions(-) delete mode 100644 deepeval/callbacks/__init__.py delete mode 100644 deepeval/callbacks/huggingface/__init__.py create mode 100644 deepeval/integrations/harness/__init__.py rename deepeval/{callbacks/huggingface/deepeval_harness_callback.py => integrations/harness/callback.py} (100%) create mode 100644 deepeval/integrations/hugging_face/__init__.py rename deepeval/{callbacks/huggingface/deepeval_callback.py => integrations/hugging_face/callback.py} (99%) rename deepeval/{callbacks/huggingface => integrations/hugging_face}/rich_manager.py (100%) rename deepeval/{callbacks/huggingface => integrations/hugging_face}/utils.py (100%) create mode 100644 deepeval/integrations/llama_index/__init__.py rename deepeval/{tracing/integrations/llama_index.py => integrations/llama_index/callback.py} (100%) create mode 100644 
deepeval/integrations/llama_index/evaluators.py create mode 100644 deepeval/integrations/llama_index/tests/test_evaluators.py create mode 100644 deepeval/integrations/llama_index/utils.py diff --git a/deepeval/callbacks/__init__.py b/deepeval/callbacks/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/deepeval/callbacks/huggingface/__init__.py b/deepeval/callbacks/huggingface/__init__.py deleted file mode 100644 index 3b63dc52d..000000000 --- a/deepeval/callbacks/huggingface/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from deepeval.callbacks.huggingface.deepeval_callback import DeepEvalCallback -from deepeval.callbacks.huggingface.deepeval_harness_callback import ( - DeepEvalHarnessCallback, -) diff --git a/deepeval/integrations/harness/__init__.py b/deepeval/integrations/harness/__init__.py new file mode 100644 index 000000000..f5d330218 --- /dev/null +++ b/deepeval/integrations/harness/__init__.py @@ -0,0 +1 @@ +from deepeval.integrations.harness import DeepEvalHarnessCallback diff --git a/deepeval/callbacks/huggingface/deepeval_harness_callback.py b/deepeval/integrations/harness/callback.py similarity index 100% rename from deepeval/callbacks/huggingface/deepeval_harness_callback.py rename to deepeval/integrations/harness/callback.py diff --git a/deepeval/integrations/hugging_face/__init__.py b/deepeval/integrations/hugging_face/__init__.py new file mode 100644 index 000000000..33e444e94 --- /dev/null +++ b/deepeval/integrations/hugging_face/__init__.py @@ -0,0 +1 @@ +from deepeval.integrations.hugging_face import DeepEvalHuggingFaceCallback diff --git a/deepeval/callbacks/huggingface/deepeval_callback.py b/deepeval/integrations/hugging_face/callback.py similarity index 99% rename from deepeval/callbacks/huggingface/deepeval_callback.py rename to deepeval/integrations/hugging_face/callback.py index b1397dd40..f18810bcd 100644 --- a/deepeval/callbacks/huggingface/deepeval_callback.py +++ b/deepeval/integrations/hugging_face/callback.py @@ -17,7 +17,7 @@ from .rich_manager import RichManager -class DeepEvalCallback(TrainerCallback): +class DeepEvalHuggingFaceCallback(TrainerCallback): """ Custom callback for deep evaluation during model training. 
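For context on the rename above, the callback is consumed exactly as before — only the import path and class name change. Below is a minimal usage sketch that mirrors the updated `tests/test_callbacks.py` later in this series; it assumes an already-configured `trainer`, `tokenizer_args`, an `EvaluationDataset` named `eval_dataset`, and a list of `deepeval` metrics named `metrics`:

```python
# Minimal sketch: attach the renamed Hugging Face callback to an existing Trainer.
# Assumes `metrics`, `eval_dataset`, `tokenizer_args`, and `trainer` are defined
# elsewhere (see tests/test_callbacks.py in this patch series for a full setup).
from deepeval.integrations.hugging_face import DeepEvalHuggingFaceCallback

callback = DeepEvalHuggingFaceCallback(
    metrics=metrics,                  # deepeval metrics to run after each epoch
    evaluation_dataset=eval_dataset,  # EvaluationDataset built from Goldens
    tokenizer_args=tokenizer_args,    # passed through when generating outputs
    trainer=trainer,                  # the transformers Trainer being monitored
    show_table=True,                  # render the live rich table of scores
    show_table_every=1,               # evaluate/update the table every epoch
)

trainer.add_callback(callback)
trainer.train()
```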
diff --git a/deepeval/callbacks/huggingface/rich_manager.py b/deepeval/integrations/hugging_face/rich_manager.py similarity index 100% rename from deepeval/callbacks/huggingface/rich_manager.py rename to deepeval/integrations/hugging_face/rich_manager.py diff --git a/deepeval/callbacks/huggingface/utils.py b/deepeval/integrations/hugging_face/utils.py similarity index 100% rename from deepeval/callbacks/huggingface/utils.py rename to deepeval/integrations/hugging_face/utils.py diff --git a/deepeval/integrations/llama_index/__init__.py b/deepeval/integrations/llama_index/__init__.py new file mode 100644 index 000000000..d471bc234 --- /dev/null +++ b/deepeval/integrations/llama_index/__init__.py @@ -0,0 +1,9 @@ +from deepeval.integrations.llama_index.callback import LlamaIndexCallbackHandler +from deepeval.integrations.llama_index.evaluators import ( + AnswerRelevancyEvaluator, + FaithfulnessEvaluator, + ContextualRelevancyEvaluator, + SummarizationEvaluator, + ToxicityEvaluator, + BiasEvaluator, +) diff --git a/deepeval/tracing/integrations/llama_index.py b/deepeval/integrations/llama_index/callback.py similarity index 100% rename from deepeval/tracing/integrations/llama_index.py rename to deepeval/integrations/llama_index/callback.py diff --git a/deepeval/integrations/llama_index/evaluators.py b/deepeval/integrations/llama_index/evaluators.py new file mode 100644 index 000000000..60ca3163a --- /dev/null +++ b/deepeval/integrations/llama_index/evaluators.py @@ -0,0 +1,295 @@ +import asyncio +from typing import Optional, Sequence, Any +from llama_index.evaluation.base import BaseEvaluator, EvaluationResult + +from deepeval.test_case import LLMTestCase +from deepeval.metrics import ( + AnswerRelevancyMetric, + FaithfulnessMetric, + SummarizationMetric, + ContextualRelevancyMetric, + BiasMetric, + ToxicityMetric, +) +from deepeval.integrations.llama_index.utils import conform_contexts_type + + +class AnswerRelevancyEvaluator(BaseEvaluator): + def __init__( + self, + threshold: float = 0.5, + include_reason: bool = True, + model: Optional[str] = None, + ): + self.threshold = threshold + self.include_reason = include_reason + self.model = model + + def _get_prompts(self): + pass + + def _update_prompts(self): + pass + + async def aevaluate( + self, + query: Optional[str] = None, + response: Optional[str] = None, + contexts: Optional[Sequence[str]] = None, + sleep_time_in_seconds: int = 0, + **kwargs: Any, + ) -> EvaluationResult: + del kwargs # Unused + + await asyncio.sleep(sleep_time_in_seconds) + + if query is None or response is None or contexts is None: + raise ValueError("Query, response, and contexts must be provided") + + test_case = LLMTestCase( + input=query, + actual_output=response, + retrieval_context=conform_contexts_type(contexts), + ) + metric = AnswerRelevancyMetric( + threshold=self.threshold, + include_reason=self.include_reason, + model=self.model, + ) + metric.measure(test_case) + return EvaluationResult( + query=query, + response=response, + passing=metric.is_successful(), + score=metric.score, + feedback=metric.reason, + ) + + +class FaithfulnessEvaluator(BaseEvaluator): + def __init__( + self, + threshold: float = 0.5, + include_reason: bool = True, + model: Optional[str] = None, + ): + self.threshold = threshold + self.include_reason = include_reason + self.model = model + + def _get_prompts(self): + pass + + def _update_prompts(self): + pass + + async def aevaluate( + self, + query: Optional[str] = None, + response: Optional[str] = None, + contexts: 
Optional[Sequence[str]] = None, + sleep_time_in_seconds: int = 0, + **kwargs: Any, + ) -> EvaluationResult: + del kwargs # Unused + + await asyncio.sleep(sleep_time_in_seconds) + + if query is None or response is None or contexts is None: + raise ValueError("Query, response, and contexts must be provided") + + test_case = LLMTestCase( + input=query, + actual_output=response, + retrieval_context=conform_contexts_type(contexts), + ) + metric = FaithfulnessMetric( + threshold=self.threshold, + include_reason=self.include_reason, + model=self.model, + ) + metric.measure(test_case) + return EvaluationResult( + query=query, + response=response, + passing=metric.is_successful(), + score=metric.score, + feedback=metric.reason, + ) + + +class ContextualRelevancyEvaluator(BaseEvaluator): + def __init__( + self, + threshold: float = 0.5, + include_reason: bool = True, + model: Optional[str] = None, + ): + self.threshold = threshold + self.include_reason = include_reason + self.model = model + + def _get_prompts(self): + pass + + def _update_prompts(self): + pass + + async def aevaluate( + self, + query: Optional[str] = None, + response: Optional[str] = None, + contexts: Optional[Sequence[str]] = None, + sleep_time_in_seconds: int = 0, + **kwargs: Any, + ) -> EvaluationResult: + del kwargs # Unused + + await asyncio.sleep(sleep_time_in_seconds) + + if query is None or response is None or contexts is None: + raise ValueError("Query, response, and contexts must be provided") + + test_case = LLMTestCase( + input=query, + actual_output=response, + retrieval_context=conform_contexts_type(contexts), + ) + metric = ContextualRelevancyMetric( + threshold=self.threshold, + include_reason=self.include_reason, + model=self.model, + ) + metric.measure(test_case) + return EvaluationResult( + query=query, + response=response, + passing=metric.is_successful(), + score=metric.score, + feedback=metric.reason, + ) + + +class SummarizationEvaluator(BaseEvaluator): + def __init__( + self, + threshold: float = 0.5, + model: Optional[str] = None, + ): + self.threshold = threshold + self.model = model + + def _get_prompts(self): + pass + + def _update_prompts(self): + pass + + async def aevaluate( + self, + query: Optional[str] = None, + response: Optional[str] = None, + contexts: Optional[Sequence[str]] = None, + sleep_time_in_seconds: int = 0, + **kwargs: Any, + ) -> EvaluationResult: + del kwargs # Unused + del contexts # Unused + + await asyncio.sleep(sleep_time_in_seconds) + + if query is None or response is None: + raise ValueError("Query and response must be provided") + + test_case = LLMTestCase(input=query, actual_output=response) + metric = SummarizationMetric(threshold=self.threshold, model=self.model) + metric.measure(test_case) + return EvaluationResult( + query=query, + response=response, + passing=metric.is_successful(), + score=metric.score, + feedback=metric.reason, + ) + + +class BiasEvaluator(BaseEvaluator): + def __init__(self, threshold: float = 0.5): + self.threshold = threshold + + def _get_prompts(self): + pass + + def _update_prompts(self): + pass + + async def aevaluate( + self, + query: Optional[str] = None, + response: Optional[str] = None, + contexts: Optional[Sequence[str]] = None, + sleep_time_in_seconds: int = 0, + **kwargs: Any, + ) -> EvaluationResult: + del kwargs # Unused + del contexts # Unused + + await asyncio.sleep(sleep_time_in_seconds) + + if query is None or response is None: + raise ValueError("Query and response must be provided") + + test_case = LLMTestCase( + input=query, + 
actual_output=response, + ) + metric = BiasMetric(threshold=self.threshold) + metric.measure(test_case) + return EvaluationResult( + query=query, + response=response, + passing=metric.is_successful(), + score=metric.score, + feedback=metric.reason, + ) + + +class ToxicityEvaluator(BaseEvaluator): + def __init__(self, threshold: float = 0.5): + self.threshold = threshold + + def _get_prompts(self): + pass + + def _update_prompts(self): + pass + + async def aevaluate( + self, + query: Optional[str] = None, + response: Optional[str] = None, + contexts: Optional[Sequence[str]] = None, + sleep_time_in_seconds: int = 0, + **kwargs: Any, + ) -> EvaluationResult: + del kwargs # Unused + del contexts # Unused + + await asyncio.sleep(sleep_time_in_seconds) + + if query is None or response is None: + raise ValueError("Query and response must be provided") + + test_case = LLMTestCase( + input=query, + actual_output=response, + ) + metric = ToxicityMetric(threshold=self.threshold) + metric.measure(test_case) + return EvaluationResult( + query=query, + response=response, + passing=metric.is_successful(), + score=metric.score, + feedback=metric.reason, + ) diff --git a/deepeval/integrations/llama_index/tests/test_evaluators.py b/deepeval/integrations/llama_index/tests/test_evaluators.py new file mode 100644 index 000000000..0a5c251f0 --- /dev/null +++ b/deepeval/integrations/llama_index/tests/test_evaluators.py @@ -0,0 +1,39 @@ +import pytest +from deepeval.integrations.llama_index import ( + AnswerRelevancyEvaluator, + FaithfulnessEvaluator, + ContextualRelevancyEvaluator, + SummarizationEvaluator, + BiasEvaluator, + ToxicityEvaluator, +) + + +def test_answer_relevancy(): + evaluator = AnswerRelevancyEvaluator() + assert evaluator is not None + + +def test_faithfulness(): + evaluator = FaithfulnessEvaluator() + assert evaluator is not None + + +def test_contextual_relevancy(): + evaluator = ContextualRelevancyEvaluator() + assert evaluator is not None + + +def test_summarization(): + evaluator = SummarizationEvaluator() + assert evaluator is not None + + +def test_bias(): + evaluator = BiasEvaluator() + assert evaluator is not None + + +def test_toxicity(): + evaluator = ToxicityEvaluator() + assert evaluator is not None diff --git a/deepeval/integrations/llama_index/utils.py b/deepeval/integrations/llama_index/utils.py new file mode 100644 index 000000000..073499c9a --- /dev/null +++ b/deepeval/integrations/llama_index/utils.py @@ -0,0 +1,10 @@ +from typing import Optional, Sequence, List, Union + + +def conform_contexts_type( + contexts: Optional[Sequence[str]] = None, +) -> Union[List[str], None]: + if contexts is None: + return None + + return list(contexts) diff --git a/deepeval/metrics/summarization.py b/deepeval/metrics/summarization.py index fb1a005c4..21195277a 100644 --- a/deepeval/metrics/summarization.py +++ b/deepeval/metrics/summarization.py @@ -12,6 +12,7 @@ closed_end_questions_template, closed_end_answers_template, ) +from deepeval.progress_context import metrics_progress_context class ScoreType(Enum): @@ -39,44 +40,51 @@ def measure(self, test_case: LLMTestCase): if test_case.input is None or test_case.actual_output is None: raise ValueError("Input or actual output cannot be None") - source_document = test_case.input - summary = test_case.actual_output + with metrics_progress_context(self.__name__, self.evaluation_model): + source_document = test_case.input + summary = test_case.actual_output - with ThreadPoolExecutor() as executor: - future_alignment = executor.submit( - 
self.get_score, ScoreType.ALIGNMENT, source_document, summary - ) - future_inclusion = executor.submit( - self.get_score, ScoreType.INCLUSION, source_document, summary - ) + with ThreadPoolExecutor() as executor: + future_alignment = executor.submit( + self.get_score, + ScoreType.ALIGNMENT, + source_document, + summary, + ) + future_inclusion = executor.submit( + self.get_score, + ScoreType.INCLUSION, + source_document, + summary, + ) - # Wait for the results - alignment_score = future_alignment.result() - inclusion_score = future_inclusion.result() + # Wait for the results + alignment_score = future_alignment.result() + inclusion_score = future_inclusion.result() - summarization_score = min(alignment_score, inclusion_score) + summarization_score = min(alignment_score, inclusion_score) - self.success = summarization_score >= self.threshold - self.score_breakdown = { - "Alignment": alignment_score, - "Inclusion": inclusion_score, - } - self.alignment_score = alignment_score - self.inclusion_score = inclusion_score - self.score = summarization_score - return self.score + self.success = summarization_score >= self.threshold + self.score_breakdown = { + "Alignment": alignment_score, + "Inclusion": inclusion_score, + } + self.alignment_score = alignment_score + self.inclusion_score = inclusion_score + self.score = summarization_score + return self.score def get_score( self, score_type: ScoreType, source_document: str, summary: str ): questions = [] if score_type == ScoreType.ALIGNMENT: - print("Calculating alignment score...") + # print("Calculating alignment score...") questions = self.generate_questions( score_type, source_document, summary ) elif score_type == ScoreType.INCLUSION: - print("Calculating inclusion score...") + # print("Calculating inclusion score...") if self.assessment_questions is None: questions = self.generate_questions( score_type, source_document, summary diff --git a/llama_test/chatbot.py b/llama_test/chatbot.py index a07724b5e..cf5b2babc 100644 --- a/llama_test/chatbot.py +++ b/llama_test/chatbot.py @@ -2,13 +2,15 @@ from llama_index import ServiceContext import llama_index -llama_index.set_global_handler("deepeval") - -service_context = ServiceContext.from_defaults(chunk_size=1000) +# llama_index.set_global_handler("deepeval") +service_context = ServiceContext.from_defaults(chunk_size=500) documents = SimpleDirectoryReader("data").load_data() index = VectorStoreIndex.from_documents(documents) query_engine = index.as_query_engine(similarity_top_k=5) def query(user_input): - return query_engine.query(user_input).response + res = query_engine.query(user_input) + # evaluator = ToxicityEvaluator() + # result = evaluator.evaluate_response(query=user_input, response=res) + return res.response diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py index 87efa45f9..851fd434e 100644 --- a/tests/test_callbacks.py +++ b/tests/test_callbacks.py @@ -11,7 +11,7 @@ from datasets import load_dataset -from deepeval.callbacks.huggingface import DeepEvalCallback +from deepeval.integrations.hugging_face import DeepEvalHuggingFaceCallback from deepeval.metrics import HallucinationMetric, AnswerRelevancyMetric from deepeval.dataset import EvaluationDataset, Golden @@ -147,8 +147,8 @@ def create_deepeval_dataset(dataset, sample_size): ) metrics = [hallucination_metric, answer_relevancy_metric] - # initalize DeepEvalCallback - callback = DeepEvalCallback( + # initalize DeepEvalHuggingFaceCallback + callback = DeepEvalHuggingFaceCallback( metrics=metrics, 
evaluation_dataset=eval_dataset, tokenizer_args=tokenizer_args, From c040b5a9b1b1dd574d252ead3d7fe9a002c70481 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Wed, 24 Jan 2024 20:40:18 -0800 Subject: [PATCH 41/74] fix tests --- .github/workflows/test.yml | 2 +- .../integrations/hugging_face/tests}/test_callbacks.py | 0 tests/test_g_eval.py | 6 ++---- 3 files changed, 3 insertions(+), 5 deletions(-) rename {tests => deepeval/integrations/hugging_face/tests}/test_callbacks.py (100%) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6de173374..2e4e03476 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -65,4 +65,4 @@ jobs: env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} run: | - poetry run pytest tests/ --ignore=tests/test_g_eval.py + poetry run pytest tests/ diff --git a/tests/test_callbacks.py b/deepeval/integrations/hugging_face/tests/test_callbacks.py similarity index 100% rename from tests/test_callbacks.py rename to deepeval/integrations/hugging_face/tests/test_callbacks.py diff --git a/tests/test_g_eval.py b/tests/test_g_eval.py index e4c09a8d3..489822a0d 100644 --- a/tests/test_g_eval.py +++ b/tests/test_g_eval.py @@ -1,12 +1,10 @@ import pytest -import openai from deepeval.test_case import LLMTestCase, LLMTestCaseParams from deepeval.metrics import GEval from deepeval import assert_test - -def test_chat_completion(): - """Test Chat Completion""" +@pytest.mark.skip(reason="openai is expensive") +def test_g_eval(): metric = GEval( name="Validity", criteria="The response is a valid response to the prompt.", From 14e33fd1a94bad6fca8b261ae6e2618129b962b5 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Wed, 24 Jan 2024 20:41:09 -0800 Subject: [PATCH 42/74] lint --- tests/test_g_eval.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_g_eval.py b/tests/test_g_eval.py index 489822a0d..aa10d3adc 100644 --- a/tests/test_g_eval.py +++ b/tests/test_g_eval.py @@ -3,6 +3,7 @@ from deepeval.metrics import GEval from deepeval import assert_test + @pytest.mark.skip(reason="openai is expensive") def test_g_eval(): metric = GEval( From 94e5457f8250a805d60ee92da847fb4ccf17161d Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Wed, 24 Jan 2024 22:57:02 -0800 Subject: [PATCH 43/74] Llamaindex docs and integration --- deepeval/integrations/llama_index/__init__.py | 12 +- .../llama_index/tests/test_evaluators.py | 24 +- docs/docs/integrations-introduction.mdx | 7 +- docs/docs/integrations-llamaindex.mdx | 235 +++++++++++++----- 4 files changed, 194 insertions(+), 84 deletions(-) diff --git a/deepeval/integrations/llama_index/__init__.py b/deepeval/integrations/llama_index/__init__.py index d471bc234..8397dec20 100644 --- a/deepeval/integrations/llama_index/__init__.py +++ b/deepeval/integrations/llama_index/__init__.py @@ -1,9 +1,9 @@ from deepeval.integrations.llama_index.callback import LlamaIndexCallbackHandler from deepeval.integrations.llama_index.evaluators import ( - AnswerRelevancyEvaluator, - FaithfulnessEvaluator, - ContextualRelevancyEvaluator, - SummarizationEvaluator, - ToxicityEvaluator, - BiasEvaluator, + AnswerRelevancyEvaluator as DeepEvalAnswerRelevancyEvaluator, + FaithfulnessEvaluator as DeepEvalFaithfulnessEvaluator, + ContextualRelevancyEvaluator as DeepEvalContextualRelevancyEvaluator, + SummarizationEvaluator as DeepEvalSummarizationEvaluator, + ToxicityEvaluator as DeepEvalToxicityEvaluator, + BiasEvaluator as DeepEvalBiasEvaluator, ) diff --git a/deepeval/integrations/llama_index/tests/test_evaluators.py 
b/deepeval/integrations/llama_index/tests/test_evaluators.py index 0a5c251f0..45d46c4ca 100644 --- a/deepeval/integrations/llama_index/tests/test_evaluators.py +++ b/deepeval/integrations/llama_index/tests/test_evaluators.py @@ -1,39 +1,39 @@ import pytest from deepeval.integrations.llama_index import ( - AnswerRelevancyEvaluator, - FaithfulnessEvaluator, - ContextualRelevancyEvaluator, - SummarizationEvaluator, - BiasEvaluator, - ToxicityEvaluator, + DeepEvalAnswerRelevancyEvaluator, + DeepEvalFaithfulnessEvaluator, + DeepEvalContextualRelevancyEvaluator, + DeepEvalSummarizationEvaluator, + DeepEvalBiasEvaluator, + DeepEvalToxicityEvaluator, ) def test_answer_relevancy(): - evaluator = AnswerRelevancyEvaluator() + evaluator = DeepEvalAnswerRelevancyEvaluator() assert evaluator is not None def test_faithfulness(): - evaluator = FaithfulnessEvaluator() + evaluator = DeepEvalFaithfulnessEvaluator() assert evaluator is not None def test_contextual_relevancy(): - evaluator = ContextualRelevancyEvaluator() + evaluator = DeepEvalContextualRelevancyEvaluator() assert evaluator is not None def test_summarization(): - evaluator = SummarizationEvaluator() + evaluator = DeepEvalSummarizationEvaluator() assert evaluator is not None def test_bias(): - evaluator = BiasEvaluator() + evaluator = DeepEvalBiasEvaluator() assert evaluator is not None def test_toxicity(): - evaluator = ToxicityEvaluator() + evaluator = DeepEvalToxicityEvaluator() assert evaluator is not None diff --git a/docs/docs/integrations-introduction.mdx b/docs/docs/integrations-introduction.mdx index 5924b184c..73be7b434 100644 --- a/docs/docs/integrations-introduction.mdx +++ b/docs/docs/integrations-introduction.mdx @@ -8,8 +8,7 @@ sidebar_label: Introduction `deepeval` offers multiple integrations for those who have already built LLM apps using other frameworks. We currently support: -- lLamaindex -- _LangChain (to be documented...)_ -- _Guardrails (to be documented...)_ +- LlamaIndex +- _Hugging Face (to be documented...)_ -You're by no means required to leverage these integrations as `deepeval` is not vendor locked into any framework. After all, all we need is data being passed to `deepeval` to evaluate your LLM application. However, you may find our integrations helpful in keeping your codebase cleaner, so we recommend giving it a try if you're looking to optimize for readability and maintability. +You're by no means required to leverage these integrations as `deepeval` is not vendor locked-in into any framework. After all, all we need are data from test cases to evaluate your LLM application. However, you may find our integrations helpful in keeping your codebase cleaner, so we recommend giving it a try if you're looking to optimize for readability and maintability. diff --git a/docs/docs/integrations-llamaindex.mdx b/docs/docs/integrations-llamaindex.mdx index af5549153..27d2de87f 100644 --- a/docs/docs/integrations-llamaindex.mdx +++ b/docs/docs/integrations-llamaindex.mdx @@ -1,100 +1,211 @@ --- id: integrations-llamaindex -title: Evaluating LlamaIndex +title: LlamaIndex sidebar_label: LlamaIndex --- ## Quick Summary -DeepEval integrates nicely with LlamaIndex's `ResponseEvaluator` class. Below is an example of the factual consistency documentation. +LlamaIndex is a data framework for LLMs that facilitates the ingestion of data from various sources such as APIs, databases, and PDFs, and indexes it for later retrieval in RAG-based LLM applications. 
+ +## Evaluating LlamaIndex (RAG) Applications + +RAG applications built using LlamaIndex can be easily evaluated within `deepeval`. Lets use this RAG application built using Llamaindex as an example: + +```python +from llama_index import VectorStoreIndex, SimpleDirectoryReader + +# Consult the LlamaIndex docs if you're unsure what this does +documents = SimpleDirectoryReader("YOUR_DATA_DIRECTORY").load_data() +index = VectorStoreIndex.from_documents(documents) +rag_application = index.as_query_engine() +``` + +You can then query your RAG application and evaluate each response using `deepeval`'s metrics: + +```python +from deepeval.metrics import AnswerRelevancyMetric +from deepeval.test_case import LLMTestCase +... + +# An example input to your RAG application +user_input = "What is LlamaIndex?" + +# LlamaIndex returns a response object that contains +# both the output string and retrieved nodes +response_object = rag_application.query(user_input) + +# Process the response object to get the output string +# and retrieved nodes +if response_object is not None: + actual_output = response_object.response + retrieval_context = [node.get_content() for node in response.source_nodes] + +# Create a test case and metric as usual +test_case = LLMTestCase( + input=user_input, + actual_output=actual_output, + retrieval_context=retrieval_context +) +answer_relevancy_metric = AnswerRelevancyMetric() + +# Evaluate +answer_relevancy_metric.measure(test_case) +print(answer_relevancy_metric.score) +print(answer_relevancy_metric.reason) +``` + +:::info +You can also extract all necessary outputs and retrieval contexts for each given input to your LlamaIndex application to [create an `EvaluationDataset` to evaluate test cases in bulk.](evaluation-datasets) +::: + +## Using DeepEval for LlamaIndex + +In LlamaIndex, there are entities known as evaluators that evaluates the responses of LlamaIndex applications. Continuing from the previous example, here's an alternative way to make use of the `AnswerRelevancyMetric` through `deepeval`'s LlamaIndex evaluators: ```python -from llama_index.response.schema import Response -from typing import List -from llama_index.schema import Document -from deepeval.metrics import HallucinationMetric -from llama_index import ( - TreeIndex, - VectorStoreIndex, - SimpleDirectoryReader, - LLMPredictor, - ServiceContext, - Response, +from deepeval.integrations.llamaindex import DeepEvalAnswerRelevancyEvaluator +... + +# An example input to your RAG application +user_input = "What is LlamaIndex?" + +# LlamaIndex returns a response object that contains +# both the output string and retrieved nodes +response_object = rag_application.query(user_input) + +answer_relevancy_evaluator = DeepEvalAnswerRelevancyEvaluator() +evaluation_result = answer_relevancy_evaluator.evaluate_response( + query=user_input, + response=response_object ) -from llama_index.llms import OpenAI -from llama_index.evaluation import ResponseEvaluator +print(evaluation_result) +``` -import os -import openai +:::note +In LlamaIndex's documentation, you might see examples where the `evaluate()` method is called on evaluators instead of the `evaluate_response()` method. While both is correct, you should **ALWAYS** use the `evaluate_response()` methods when using `deepeval`'s LlamaIndex evaluators. 
+::: -api_key = "sk-XXX" -openai.api_key = api_key +### Answer Relevancy -gpt4 = OpenAI(temperature=0, model="gpt-4", api_key=api_key) -service_context_gpt4 = ServiceContext.from_defaults(llm=gpt4) -evaluator_gpt4 = ResponseEvaluator(service_context=service_context_gpt4) +The `DeepEvalAnswerRelevancyEvaluator` uses `deepeval`'s `AnswerRelevancyMetric` for evaluation. + +```python +from deepeval.integrations.llamaindex import DeepEvalAnswerRelevancyEvaluator + +evaluator = DeepEvalAnswerRelevancyEvaluator( + # Optional. A float representing the minimum passing threshold, defaulted to 0.5. + threshold=0.5, + # Optional. A string specifying which of OpenAI's GPT models to use, defaulted to 'gpt-4-1106-preview'. + model="gpt-4-1106-preview", + # Optional. A boolean which when set to `True`, will include a reason for its evaluation score, defaulted to `True`. + include_reason=True +) ``` -Getting a lLamaHub Loader +### Faithfulness + +The `DeepEvalFaithfulnessEvaluator` uses `deepeval`'s `FaithfulnessMetric` for evaluation. ```python -from llama_index import download_loader +from deepeval.integrations.llamaindex import DeepEvalFaithfulnessEvaluator + +evaluator = DeepEvalFaithfulnessEvaluator( + # Optional. A float representing the minimum passing threshold, defaulted to 0.5. + threshold=0.5, + # Optional. A string specifying which of OpenAI's GPT models to use, defaulted to 'gpt-4-1106-preview'. + model="gpt-4-1106-preview", + # Optional. A boolean which when set to `True`, will include a reason for its evaluation score, defaulted to `True`. + include_reason=True +) +``` + +### Contextual Relevancy -WikipediaReader = download_loader("WikipediaReader") +The `DeepEvalContextualRelevancyEvaluator` uses `deepeval`'s `ContextualRelevancyMetric` for evaluation. -loader = WikipediaReader() -documents = loader.load_data(pages=['Tokyo']) -tree_index = TreeIndex.from_documents(documents=documents) -vector_index = VectorStoreIndex.from_documents( - documents, service_context=service_context_gpt4 +```python +from deepeval.integrations.llamaindex import DeepEvalContextualRelevancyEvaluator + +evaluator = DeepEvalContextualRelevancyEvaluator( + # Optional. A float representing the minimum passing threshold, defaulted to 0.5. + threshold=0.5, + # Optional. A string specifying which of OpenAI's GPT models to use, defaulted to 'gpt-4-1106-preview'. + model="gpt-4-1106-preview", + # Optional. A boolean which when set to `True`, will include a reason for its evaluation score, defaulted to `True`. + include_reason=True ) ``` -We then build an evaluator based on the BaseEvaluator class that requires an evaluate method. +### Summarization -In this example, we show you how to write a factual consistency check. +The `DeepEvalSummarizationEvaluator` uses `deepeval`'s `SummarizationMetric` for evaluation. ```python -from deepeval.test_case import LLMTestCase -from deepeval.metrics import HallucinationMetric +from deepeval.integrations.llamaindex import DeepEvalSummarizationEvaluator -class HallucinationResponseEvaluator: - def get_context(self, response: Response) -> List[Document]: - """Get context information from given Response object using source nodes. +evaluator = DeepEvalSummarizationEvaluator( + # Optional. A float representing the minimum passing threshold, defaulted to 0.5. + threshold=0.5, + # Optional. A string specifying which of OpenAI's GPT models to use, defaulted to 'gpt-4-1106-preview'. 
+ model="gpt-4-1106-preview" +) +``` - Args: - response (Response): Response object from an index based on the query. +### Bias - Returns: - List of Documents of source nodes information as context information. - """ - context = [] +The `DeepEvalBiasEvaluator` uses `deepeval`'s `BiasMetric` for evaluation. + +```python +from deepeval.integrations.llamaindex import DeepEvalBiasEvaluator - for context_info in response.source_nodes: - context.append(Document(text=context_info.node.get_content())) +evaluator = DeepEvalBiasEvaluator( + # Optional. A float representing the minimum passing threshold, defaulted to 0.5. + threshold=0.5 +) +``` - return context +### Toxicity - def evaluate(self, response: Response) -> str: +The `DeepEvalToxicityEvaluator` uses `deepeval`'s `ToxicityMetric` for evaluation. - # Evaluate factual consistency metrics - answer = str(response) - metric = HallucinationMetric() - context = self.get_context(response) - test_case = LLMTestCase(input="This is an example input", context=context, actual_output=answer) - score = metric.measure(test_case=test_case) - if metric.is_successful(): - return "YES" - else: - return "NO" +```python +from deepeval.integrations.llamaindex import DeepEvalToxicityEvaluator -evaluator = HallucinationResponseEvaluator() +evaluator = DeepEvalToxicityEvaluator( + # Optional. A float representing the minimum passing threshold, defaulted to 0.5. + threshold=0.5 +) ``` -You can then evaluate as such: +## Metrics vs Evaluators + +While both `deepeval`'s metrics and evaluators yield the same result, `deepeval` is a full evaluation suite built specifically for LLM evaluation. Naturally, `deepeval` forces you to follow evaluation best practices, something not accomplishable through the use of the evaluators abstraction. + +So while both metrics and evaluators can be used for a one-off, standalone evaluation, metrics: + +- can be combined to evaluate multiple criteria asynchronously +- can be used to evaluate entire `EvaluationDataset`s +- can leverage `deepeval`'s native Pytest integration to unit test LlamaIndex applications in CI/CD pipelines +- can be used with any framework, meaning you are not vendor locked-in into LlamaIndex +- covers a wider range of evaluation criteria/use cases +- automatically integrates with [Confident AI](confident-ai-introduction), which offers evaluation analysis, evaluation debugging, dataset management, and real-time evaluations in production + +:::note +The only upside of using `deepeval`'s LlamaIndex evaluators instead of metrics, is an evaluator automatically extracts the `retrieval_context` from a LlamaIndex response. However, as shown in previous examples, manually extracting the `retrieval_context` from a LlamaIndex response is extremely straightforward: ```python -query_engine = tree_index.as_query_engine() -response = query_engine.query("How did Tokyo get its name?") -eval_result = evaluator.evaluate(response) +... 
+ +# LlamaIndex returns a response object that contains +# both the output string and retrieved nodes +response_object = rag_application.query(user_input) + +# Process the response object to get the output string +# and retrieved nodes +if response_object is not None: + actual_output = response_object.response + retrieval_context = [node.get_content() for node in response.source_nodes] ``` + +::: From a88314a89b4e6c1de399512507e8d4f3939d6cba Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Wed, 24 Jan 2024 23:00:20 -0800 Subject: [PATCH 44/74] Updated docs --- docs/docs/confident-ai-evals-in-production.mdx | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/docs/confident-ai-evals-in-production.mdx b/docs/docs/confident-ai-evals-in-production.mdx index bc06ebda9..7ff047b0c 100644 --- a/docs/docs/confident-ai-evals-in-production.mdx +++ b/docs/docs/confident-ai-evals-in-production.mdx @@ -14,7 +14,6 @@ Simply add `deepeval.track(...)` in your application to start tracking events. ```python import deepeval -... # At the end of your LLM call deepeval.track( @@ -32,7 +31,6 @@ deepeval.track( fail_silently=True, run_on_background_thread=True ) - ``` The `track()` function takes in the following arguments: From 20d854b488b4d08605fdc8d692ed5e068dba1bd1 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Wed, 24 Jan 2024 23:05:23 -0800 Subject: [PATCH 45/74] . --- docs/docs/integrations-llamaindex.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/integrations-llamaindex.mdx b/docs/docs/integrations-llamaindex.mdx index 27d2de87f..af767ccd9 100644 --- a/docs/docs/integrations-llamaindex.mdx +++ b/docs/docs/integrations-llamaindex.mdx @@ -83,7 +83,7 @@ print(evaluation_result) ``` :::note -In LlamaIndex's documentation, you might see examples where the `evaluate()` method is called on evaluators instead of the `evaluate_response()` method. While both is correct, you should **ALWAYS** use the `evaluate_response()` methods when using `deepeval`'s LlamaIndex evaluators. +In LlamaIndex's documentation, you might see examples where the `evaluate()` method is called on an evaluator instead of the `evaluate_response()` method. While both is correct, you should **ALWAYS** use the `evaluate_response()` methods when using `deepeval`'s LlamaIndex evaluators. 
::: ### Answer Relevancy From aac58342c1c66ec4b8ad50161e4d27c4486a01cb Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Wed, 24 Jan 2024 23:09:44 -0800 Subject: [PATCH 46/74] new release --- deepeval/_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deepeval/_version.py b/deepeval/_version.py index 815f41ae5..1f97ae6e7 100644 --- a/deepeval/_version.py +++ b/deepeval/_version.py @@ -1 +1 @@ -__version__: str = "0.20.52" +__version__: str = "0.20.53" diff --git a/pyproject.toml b/pyproject.toml index ca5894494..9125de7f3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "deepeval" -version = "0.20.52" +version = "0.20.53" description = "The Evaluation Framework for LLMs" authors = ["Jeffrey Ip "] license = "Apache-2.0" From f1c7a6e0f43640be90fe225fa799bd4c3bec36e0 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Wed, 24 Jan 2024 23:21:45 -0800 Subject: [PATCH 47/74] Update docs --- docs/docs/integrations-llamaindex.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/docs/integrations-llamaindex.mdx b/docs/docs/integrations-llamaindex.mdx index af767ccd9..072677460 100644 --- a/docs/docs/integrations-llamaindex.mdx +++ b/docs/docs/integrations-llamaindex.mdx @@ -74,8 +74,8 @@ user_input = "What is LlamaIndex?" # both the output string and retrieved nodes response_object = rag_application.query(user_input) -answer_relevancy_evaluator = DeepEvalAnswerRelevancyEvaluator() -evaluation_result = answer_relevancy_evaluator.evaluate_response( +evaluator = DeepEvalAnswerRelevancyEvaluator() +evaluation_result = evaluator.evaluate_response( query=user_input, response=response_object ) From da2f79932f9fae3c2c5b011c46dc5631e560f7dc Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Thu, 25 Jan 2024 03:03:49 -0800 Subject: [PATCH 48/74] Conditional import --- deepeval/integrations/harness/callback.py | 27 +- .../integrations/hugging_face/callback.py | 434 +++++++++--------- deepeval/models/answer_relevancy_model.py | 2 +- deepeval/models/hallucination_model.py | 7 +- 4 files changed, 247 insertions(+), 223 deletions(-) diff --git a/deepeval/integrations/harness/callback.py b/deepeval/integrations/harness/callback.py index bbb8ce9a5..e8896a09f 100644 --- a/deepeval/integrations/harness/callback.py +++ b/deepeval/integrations/harness/callback.py @@ -1,17 +1,26 @@ from typing import List, Union -from transformers.trainer_callback import TrainerCallback # from deepeval.experimental import BaseEvaluationExperiment +try: + from transformers.trainer_callback import TrainerCallback -class DeepEvalHarnessCallback(TrainerCallback): - """ - A [transformers.TrainerCallback] that logs various harness LLM evaluation metrics to DeepEval - """ + class DeepEvalHarnessCallback(TrainerCallback): + """ + A [transformers.TrainerCallback] that logs various harness LLM evaluation metrics to DeepEval + """ - def __init__(self, experiments): - super().__init__() - self.experiments = experiments + def __init__(self, experiments): + super().__init__() + self.experiments = experiments - raise NotImplementedError("DeepEvalHarnessCallback is WIP") + raise NotImplementedError("DeepEvalHarnessCallback is WIP") + +except ImportError: + + class DeepEvalHarnessCallback: + def __init__(self, *args, **kwargs): + raise ImportError( + "The 'transformers' library is required to use the DeepEvalHarnessCallback." 
+ ) diff --git a/deepeval/integrations/hugging_face/callback.py b/deepeval/integrations/hugging_face/callback.py index f18810bcd..7c273d05a 100644 --- a/deepeval/integrations/hugging_face/callback.py +++ b/deepeval/integrations/hugging_face/callback.py @@ -1,226 +1,236 @@ from typing import Union, List, Dict - -from transformers import ( - TrainerCallback, - ProgressCallback, - Trainer, - TrainingArguments, - TrainerState, - TrainerControl, -) +from .utils import get_column_order, generate_test_cases +from .rich_manager import RichManager from deepeval.metrics import BaseMetric from deepeval.evaluate import execute_test from deepeval.dataset import EvaluationDataset -from .utils import get_column_order, generate_test_cases -from .rich_manager import RichManager - +try: + from transformers import ( + TrainerCallback, + ProgressCallback, + Trainer, + TrainingArguments, + TrainerState, + TrainerControl, + ) -class DeepEvalHuggingFaceCallback(TrainerCallback): - """ - Custom callback for deep evaluation during model training. - - Args: - metrics (Union[BaseMetric, List[BaseMetric]]): Evaluation metrics. - evaluation_dataset (EvaluationDataset): Dataset for evaluation. - tokenizer_args (Dict): Arguments for the tokenizer. - aggregation_method (str): Method for aggregating metric scores. - trainer (Trainer): Model trainer. - """ - - def __init__( - self, - metrics: Union[BaseMetric, List[BaseMetric]] = None, - evaluation_dataset: EvaluationDataset = None, - tokenizer_args: Dict = None, - aggregation_method: str = "avg", - trainer: Trainer = None, - show_table: bool = False, - show_table_every: int = 1, - ) -> None: - super().__init__() - - self.show_table = show_table - self.show_table_every = show_table_every - self.metrics = metrics - self.evaluation_dataset = evaluation_dataset - self.tokenizer_args = tokenizer_args - self.aggregation_method = aggregation_method - self.trainer = trainer - - self.task_descriptions = { - "generating": "[blue][STATUS] [white]Generating output from model (might take up few minutes)", - "training": "[blue][STATUS] [white]Training in Progress", - "evaluate": "[blue][STATUS] [white]Evaluating test-cases (might take up few minutes)", - "training_end": "[blue][STATUS] [white]Training Ended", - } - - self.train_bar_started = False - self.epoch_counter = 0 - self.deepeval_metric_history = [] - - total_train_epochs = self.trainer.args.num_train_epochs - self.rich_manager = RichManager(show_table, total_train_epochs) - self.trainer.remove_callback(ProgressCallback) - - def _calculate_metric_scores(self) -> Dict[str, List[float]]: + class DeepEvalHuggingFaceCallback(TrainerCallback): """ - Calculate final evaluation scores based on metrics and test cases. - - Returns: - Dict[str, List[float]]: Metric scores for each test case. - """ - test_results = execute_test( - test_cases=self.evaluation_dataset.test_cases, metrics=self.metrics - ) - scores = {} - for test in test_results: - for metric in test.metrics: - metric_name = str(metric.__name__).lower().replace(" ", "_") - metric_score = metric.score - scores.setdefault(metric_name, []).append(metric_score) - - scores = self._aggregate_scores(scores) - return scores - - def _aggregate_scores( - self, scores: Dict[str, List[float]] - ) -> Dict[str, float]: - """ - Aggregate metric scores using the specified method. + Custom callback for deep evaluation during model training. Args: - aggregation_method (str): Method for aggregating scores. - scores (Dict[str, List[float]]): Metric scores for each test case. 
- - Returns: - Dict[str, float]: Aggregated metric scores. - """ - aggregation_functions = { - "avg": lambda x: sum(x) / len(x), - "max": max, - "min": min, - } - if self.aggregation_method not in aggregation_functions: - raise ValueError( - "Incorrect 'aggregation_method', only accepts ['avg', 'min, 'max']" + metrics (Union[BaseMetric, List[BaseMetric]]): Evaluation metrics. + evaluation_dataset (EvaluationDataset): Dataset for evaluation. + tokenizer_args (Dict): Arguments for the tokenizer. + aggregation_method (str): Method for aggregating metric scores. + trainer (Trainer): Model trainer. + """ + + def __init__( + self, + metrics: Union[BaseMetric, List[BaseMetric]] = None, + evaluation_dataset: EvaluationDataset = None, + tokenizer_args: Dict = None, + aggregation_method: str = "avg", + trainer: Trainer = None, + show_table: bool = False, + show_table_every: int = 1, + ) -> None: + super().__init__() + + self.show_table = show_table + self.show_table_every = show_table_every + self.metrics = metrics + self.evaluation_dataset = evaluation_dataset + self.tokenizer_args = tokenizer_args + self.aggregation_method = aggregation_method + self.trainer = trainer + + self.task_descriptions = { + "generating": "[blue][STATUS] [white]Generating output from model (might take up few minutes)", + "training": "[blue][STATUS] [white]Training in Progress", + "evaluate": "[blue][STATUS] [white]Evaluating test-cases (might take up few minutes)", + "training_end": "[blue][STATUS] [white]Training Ended", + } + + self.train_bar_started = False + self.epoch_counter = 0 + self.deepeval_metric_history = [] + + total_train_epochs = self.trainer.args.num_train_epochs + self.rich_manager = RichManager(show_table, total_train_epochs) + self.trainer.remove_callback(ProgressCallback) + + def _calculate_metric_scores(self) -> Dict[str, List[float]]: + """ + Calculate final evaluation scores based on metrics and test cases. + + Returns: + Dict[str, List[float]]: Metric scores for each test case. + """ + test_results = execute_test( + test_cases=self.evaluation_dataset.test_cases, + metrics=self.metrics, ) - return { - key: aggregation_functions[self.aggregation_method](value) - for key, value in scores.items() - } - - def on_epoch_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - """ - Event triggered at the begining of each training epoch. - """ - self.epoch_counter += 1 - - def on_epoch_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - """ - Event triggered at the end of each training epoch. - """ - control.should_log = True - self.rich_manager.change_spinner_text( - self.task_descriptions["generating"] - ) - test_cases = generate_test_cases( - self.trainer.model, - self.trainer.tokenizer, - self.tokenizer_args, - self.evaluation_dataset, - ) - self.evaluation_dataset.test_cases = test_cases - - def on_log( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - """ - Event triggered after logging the last logs. 
- """ - if ( - self.show_table - and len(state.log_history) <= self.trainer.args.num_train_epochs - ): - self.rich_manager.advance_progress() - - if self.epoch_counter % self.show_table_every == 0: - self.rich_manager.change_spinner_text( - self.task_descriptions["evaluate"] - ) - - scores = self._calculate_metric_scores() - self.deepeval_metric_history.append(scores) - self.deepeval_metric_history[-1].update(state.log_history[-1]) - - self.rich_manager.change_spinner_text( - self.task_descriptions["training"] + scores = {} + for test in test_results: + for metric in test.metrics: + metric_name = str(metric.__name__).lower().replace(" ", "_") + metric_score = metric.score + scores.setdefault(metric_name, []).append(metric_score) + + scores = self._aggregate_scores(scores) + return scores + + def _aggregate_scores( + self, scores: Dict[str, List[float]] + ) -> Dict[str, float]: + """ + Aggregate metric scores using the specified method. + + Args: + aggregation_method (str): Method for aggregating scores. + scores (Dict[str, List[float]]): Metric scores for each test case. + + Returns: + Dict[str, float]: Aggregated metric scores. + """ + aggregation_functions = { + "avg": lambda x: sum(x) / len(x), + "max": max, + "min": min, + } + if self.aggregation_method not in aggregation_functions: + raise ValueError( + "Incorrect 'aggregation_method', only accepts ['avg', 'min, 'max']" ) - columns = self._generate_table() - self.rich_manager.update(columns) - - def _generate_table(self): - """ - Generates table, along with progress bars - - Returns: - rich.Columns: contains table and 2 progress bars - """ - column, table = self.rich_manager.create_column() - order = get_column_order(self.deepeval_metric_history[-1]) - - if self.show_table: - for key in order: - table.add_column(key) - - for row in self.deepeval_metric_history: - table.add_row(*[str(row[value]) for value in order]) + return { + key: aggregation_functions[self.aggregation_method](value) + for key, value in scores.items() + } + + def on_epoch_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + """ + Event triggered at the begining of each training epoch. + """ + self.epoch_counter += 1 + + def on_epoch_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + """ + Event triggered at the end of each training epoch. + """ + control.should_log = True + self.rich_manager.change_spinner_text( + self.task_descriptions["generating"] + ) + test_cases = generate_test_cases( + self.trainer.model, + self.trainer.tokenizer, + self.tokenizer_args, + self.evaluation_dataset, + ) + self.evaluation_dataset.test_cases = test_cases + + def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + """ + Event triggered after logging the last logs. 
+ """ + if ( + self.show_table + and len(state.log_history) <= self.trainer.args.num_train_epochs + ): + self.rich_manager.advance_progress() + + if self.epoch_counter % self.show_table_every == 0: + self.rich_manager.change_spinner_text( + self.task_descriptions["evaluate"] + ) + + scores = self._calculate_metric_scores() + self.deepeval_metric_history.append(scores) + self.deepeval_metric_history[-1].update( + state.log_history[-1] + ) + + self.rich_manager.change_spinner_text( + self.task_descriptions["training"] + ) + columns = self._generate_table() + self.rich_manager.update(columns) + + def _generate_table(self): + """ + Generates table, along with progress bars + + Returns: + rich.Columns: contains table and 2 progress bars + """ + column, table = self.rich_manager.create_column() + order = get_column_order(self.deepeval_metric_history[-1]) + + if self.show_table: + for key in order: + table.add_column(key) + + for row in self.deepeval_metric_history: + table.add_row(*[str(row[value]) for value in order]) + + return column + + def on_train_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + """ + Event triggered at the end of model training. + """ + self.rich_manager.change_spinner_text( + self.task_descriptions["training_end"] + ) + self.rich_manager.stop() + + def on_train_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + """ + Event triggered at the begining of model training. + """ + self.rich_manager.start() + self.rich_manager.change_spinner_text( + self.task_descriptions["training"] + ) - return column +except ImportError: - def on_train_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - """ - Event triggered at the end of model training. - """ - self.rich_manager.change_spinner_text( - self.task_descriptions["training_end"] - ) - self.rich_manager.stop() - - def on_train_begin( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - """ - Event triggered at the begining of model training. - """ - self.rich_manager.start() - self.rich_manager.change_spinner_text( - self.task_descriptions["training"] - ) + class DeepEvalHuggingFaceCallback: + def __init__(self, *args, **kwargs): + raise ImportError( + "The 'transformers' library is required to use the DeepEvalHuggingFaceCallback." 
+ ) diff --git a/deepeval/models/answer_relevancy_model.py b/deepeval/models/answer_relevancy_model.py index 88ee30391..f1be5dca7 100644 --- a/deepeval/models/answer_relevancy_model.py +++ b/deepeval/models/answer_relevancy_model.py @@ -42,7 +42,7 @@ def _call(self, text: str): class CrossEncoderAnswerRelevancyModel(DeepEvalBaseModel): - def __init__(self, model_name: str | None = None): + def __init__(self, model_name: Optional[str] = None): model_name = ( "cross-encoder/nli-deberta-v3-base" if model_name is None diff --git a/deepeval/models/hallucination_model.py b/deepeval/models/hallucination_model.py index 65c4681bf..6e124c0d9 100644 --- a/deepeval/models/hallucination_model.py +++ b/deepeval/models/hallucination_model.py @@ -1,12 +1,17 @@ import os from typing import Optional from deepeval.singleton import Singleton -from sentence_transformers import CrossEncoder from deepeval.progress_context import progress_context class HallucinationModel(metaclass=Singleton): def __init__(self, model_name: Optional[str] = None): + try: + from sentence_transformers import CrossEncoder + except ImportError: + raise ImportError( + "The 'sentence_transformers' library is required to use the HallucinationMetric." + ) # We use a smple cross encoder model model_name = ( "vectara/hallucination_evaluation_model" From e107c40795b9e7058e3c878e653e87b1fe687be2 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Thu, 25 Jan 2024 15:14:28 -0800 Subject: [PATCH 49/74] added deployment flag --- deepeval/cli/test.py | 9 ++++++--- deepeval/plugins/plugin.py | 10 ++++++---- deepeval/test_run/test_run.py | 2 ++ deepeval/utils.py | 37 +++++++++++++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 7 deletions(-) diff --git a/deepeval/cli/test.py b/deepeval/cli/test.py index 58fd5b300..1664356fa 100644 --- a/deepeval/cli/test.py +++ b/deepeval/cli/test.py @@ -4,7 +4,7 @@ from typing_extensions import Annotated from typing import Optional from deepeval.test_run import test_run_manager, TEMP_FILE_NAME -from deepeval.utils import delete_file_if_exists +from deepeval.utils import delete_file_if_exists, get_ci_env from deepeval.test_run import invoke_test_run_end_hook from deepeval.telemetry import capture_evaluation_count @@ -59,8 +59,11 @@ def run( if exit_on_first_failure: pytest_args.insert(0, "-x") - if deployment: - pytest_args.append("--deployment") + ci_env = get_ci_env() + if ci_env is not None: + pytest_args.extend(["--deployment", ci_env]) + elif deployment: + pytest_args.extend(["--deployment", ""]) pytest_args.extend( [ diff --git a/deepeval/plugins/plugin.py b/deepeval/plugins/plugin.py index 3814356ef..67231e011 100644 --- a/deepeval/plugins/plugin.py +++ b/deepeval/plugins/plugin.py @@ -9,8 +9,10 @@ def pytest_sessionstart(session: pytest.Session): test_run_manager.save_to_disk = True try: + deployment = session.config.getoption("--deployment") test_run_manager.create_test_run( - deployment=session.config.getoption("--deployment"), + # TODO: change to deployment + deployment=False, file_name=session.config.getoption("file_or_dir")[0], ) except: @@ -20,9 +22,9 @@ def pytest_sessionstart(session: pytest.Session): def pytest_addoption(parser): parser.addoption( "--deployment", - action="store_true", - default=False, - help="Enable deployment mode", + action="store", + default=None, + help="Set deployment mode (optionally provide a string value)", ) diff --git a/deepeval/test_run/test_run.py b/deepeval/test_run/test_run.py index 3f5474deb..83426549d 100644 --- a/deepeval/test_run/test_run.py +++ 
b/deepeval/test_run/test_run.py @@ -64,6 +64,7 @@ class TestRun(BaseModel): None, alias="testFile", ) + # TODO: change to Optional[str] deployment: Optional[bool] = Field(True) dict_test_cases: Dict[int, APITestCase] = Field( default_factory=dict, @@ -159,6 +160,7 @@ def set_test_run(self, test_run: TestRun): def create_test_run( self, + # TODO: change to Optional[str] deployment: Optional[bool] = False, file_name: Optional[str] = None, ): diff --git a/deepeval/utils.py b/deepeval/utils.py index 543468007..e7fc08745 100644 --- a/deepeval/utils.py +++ b/deepeval/utils.py @@ -14,6 +14,43 @@ from deepeval.key_handler import KeyValues, KEY_FILE_HANDLER +def get_ci_env(): + # CircleCI + if os.getenv("CIRCLECI") == "true": + return "CircleCI" + + # Travis CI + if os.getenv("TRAVIS") == "true": + return "Travis CI" + + # GitLab CI + if os.getenv("GITLAB_CI") == "true": + return "GitLab CI" + + # GitHub Actions + if os.getenv("GITHUB_ACTIONS") == "true": + return "GitHub Actions" + + # Jenkins + if os.getenv("JENKINS_URL"): + return "Jenkins" + + # Bitbucket Pipelines + if os.getenv("BITBUCKET_COMMIT"): + return "Bitbucket Pipelines" + + # AppVeyor + if os.getenv("APPVEYOR") == "True": + return "AppVeyor" + + # Azure Pipelines + if os.getenv("AZURE_PIPELINES"): + return "Azure Pipelines" + + # Default to None if none of the CI variables are set + return None + + def is_confident(): confident_api_key = KEY_FILE_HANDLER.fetch_data(KeyValues.API_KEY) return confident_api_key is not None From ddf5735c332bc14187e6ae47a057f60c9a2f5fd2 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Fri, 26 Jan 2024 02:34:11 -0800 Subject: [PATCH 50/74] Added retrieval Context --- deepeval/dataset/api.py | 2 +- deepeval/dataset/dataset.py | 2 -- deepeval/utils.py | 11 +++++++++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/deepeval/dataset/api.py b/deepeval/dataset/api.py index c758dd3f6..641967fda 100644 --- a/deepeval/dataset/api.py +++ b/deepeval/dataset/api.py @@ -7,7 +7,7 @@ class Golden(BaseModel): actual_output: Optional[str] = Field(None, alias="actualOutput") expected_output: Optional[str] = Field(None, alias="expectedOutput") context: Optional[list] = Field(None) - retrieval_context: Optional[list] = Field(None) + retrieval_context: Optional[list] = Field(None, alias="retrievalContext") additional_metadata: Optional[Dict] = Field( None, alias="additionalMetadata" ) diff --git a/deepeval/dataset/dataset.py b/deepeval/dataset/dataset.py index 516d27114..045992d2c 100644 --- a/deepeval/dataset/dataset.py +++ b/deepeval/dataset/dataset.py @@ -288,8 +288,6 @@ def pull(self, alias: str, auto_convert_goldens_to_test_cases: bool = True): goldens=result["goldens"], ) - self.goldens = response.goldens - if auto_convert_goldens_to_test_cases: self.test_cases = convert_goldens_to_test_cases(self.goldens) else: diff --git a/deepeval/utils.py b/deepeval/utils.py index e7fc08745..614107407 100644 --- a/deepeval/utils.py +++ b/deepeval/utils.py @@ -29,6 +29,17 @@ def get_ci_env(): # GitHub Actions if os.getenv("GITHUB_ACTIONS") == "true": + actor = os.getenv("GITHUB_ACTOR") + branch_ref = os.getenv("GITHUB_REF") + commit_sha = os.getenv("GITHUB_SHA") + repo_slug = os.getenv("GITHUB_REPOSITORY") + + # For branch name, especially for pull requests + if branch_ref.startswith("refs/pull/"): + pr_number = branch_ref.split("/")[2] + branch_name = f"PR-{pr_number}" + else: + branch_name = branch_ref.replace("refs/heads/", "") return "GitHub Actions" # Jenkins From 58993da3f5fdd12b953f20cb713c53372cb436bc Mon Sep 17 
00:00:00 2001 From: Jeffrey Ip Date: Fri, 26 Jan 2024 23:29:00 -0800 Subject: [PATCH 51/74] Added deployment configs --- deepeval/cli/test.py | 9 ++---- deepeval/plugins/plugin.py | 15 ++++++--- deepeval/test_run/test_run.py | 7 +++-- deepeval/utils.py | 57 +++++++++-------------------------- 4 files changed, 34 insertions(+), 54 deletions(-) diff --git a/deepeval/cli/test.py b/deepeval/cli/test.py index 1664356fa..34964c76d 100644 --- a/deepeval/cli/test.py +++ b/deepeval/cli/test.py @@ -1,6 +1,7 @@ import pytest import typer import os +import json from typing_extensions import Annotated from typing import Optional from deepeval.test_run import test_run_manager, TEMP_FILE_NAME @@ -47,9 +48,6 @@ def run( "-n", help="Number of processes to use with pytest", ), - deployment: bool = typer.Option( - False, "-d", "--deployment", help="Flag to indicate deployment" - ), ): """Run a test""" delete_file_if_exists(TEMP_FILE_NAME) @@ -61,9 +59,8 @@ def run( ci_env = get_ci_env() if ci_env is not None: - pytest_args.extend(["--deployment", ci_env]) - elif deployment: - pytest_args.extend(["--deployment", ""]) + ci_env_json = json.dumps(ci_env) + pytest_args.extend(["--deployment", ci_env_json]) pytest_args.extend( [ diff --git a/deepeval/plugins/plugin.py b/deepeval/plugins/plugin.py index 67231e011..48c729a63 100644 --- a/deepeval/plugins/plugin.py +++ b/deepeval/plugins/plugin.py @@ -1,5 +1,6 @@ import pytest import os +import json from rich import print from typing import Optional, Any from deepeval.constants import PYTEST_RUN_TEST_NAME @@ -9,10 +10,16 @@ def pytest_sessionstart(session: pytest.Session): test_run_manager.save_to_disk = True try: - deployment = session.config.getoption("--deployment") + deployment_configs = session.config.getoption("--deployment") + if deployment_configs is None: + deployment = False + else: + deployment = True + deployment_configs = json.loads(deployment_configs) + test_run_manager.create_test_run( - # TODO: change to deployment - deployment=False, + deployment=deployment, + deployment_configs=deployment_configs, file_name=session.config.getoption("file_or_dir")[0], ) except: @@ -24,7 +31,7 @@ def pytest_addoption(parser): "--deployment", action="store", default=None, - help="Set deployment mode (optionally provide a string value)", + help="Set deployment configs", ) diff --git a/deepeval/test_run/test_run.py b/deepeval/test_run/test_run.py index 83426549d..a3078d237 100644 --- a/deepeval/test_run/test_run.py +++ b/deepeval/test_run/test_run.py @@ -66,6 +66,7 @@ class TestRun(BaseModel): ) # TODO: change to Optional[str] deployment: Optional[bool] = Field(True) + deployment_configs: Optional[Dict] = Field(None, alias="deploymentConfigs") dict_test_cases: Dict[int, APITestCase] = Field( default_factory=dict, ) @@ -160,8 +161,8 @@ def set_test_run(self, test_run: TestRun): def create_test_run( self, - # TODO: change to Optional[str] deployment: Optional[bool] = False, + deployment_configs: Optional[Dict] = False, file_name: Optional[str] = None, ): test_run = TestRun( @@ -170,6 +171,7 @@ def create_test_run( metricScores=[], configurations={}, deployment=deployment, + deploymentConfigs=deployment_configs, ) self.set_test_run(test_run) @@ -292,7 +294,8 @@ def post_test_run(self, test_run: TestRun): "✅ Tests finished! View results on " f"[link={link}]{link}[/link]" ) - webbrowser.open(link) + if test_run.deployment == False: + webbrowser.open(link) else: console.print( '✅ Tests finished! Run "deepeval login" to view evaluation results on the web.' 
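Editor's note: taken together, the CLI and plugin changes in this patch hand CI metadata across the pytest boundary as a single JSON-encoded `--deployment` option. A minimal sketch of that handshake, using only names introduced in this patch (the surrounding Typer/pytest wiring is assumed):

```python
import json
import os

# CLI side (`deepeval test run`): detect a CI environment and serialize its metadata
ci_env = {"env": "GitHub Actions", "actor": os.getenv("GITHUB_ACTOR")}
pytest_args = ["--deployment", json.dumps(ci_env)]

# Plugin side (pytest_sessionstart): any non-None value marks the run as a deployment
raw_option = pytest_args[1]  # stands in for session.config.getoption("--deployment")
deployment = raw_option is not None
deployment_configs = json.loads(raw_option) if deployment else None
```

Serializing a dict rather than passing a plain boolean flag is what lets later patches in this series attach richer deployment metadata to the test run.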
diff --git a/deepeval/utils.py b/deepeval/utils.py index 614107407..eafed5728 100644 --- a/deepeval/utils.py +++ b/deepeval/utils.py @@ -2,7 +2,7 @@ import copy import os import time -from typing import Any +from typing import Any, Optional, Dict from collections.abc import Iterable import tqdm import re @@ -14,51 +14,24 @@ from deepeval.key_handler import KeyValues, KEY_FILE_HANDLER -def get_ci_env(): - # CircleCI - if os.getenv("CIRCLECI") == "true": - return "CircleCI" - - # Travis CI - if os.getenv("TRAVIS") == "true": - return "Travis CI" - - # GitLab CI - if os.getenv("GITLAB_CI") == "true": - return "GitLab CI" - - # GitHub Actions +def get_ci_env() -> Optional[Dict]: if os.getenv("GITHUB_ACTIONS") == "true": - actor = os.getenv("GITHUB_ACTOR") - branch_ref = os.getenv("GITHUB_REF") - commit_sha = os.getenv("GITHUB_SHA") - repo_slug = os.getenv("GITHUB_REPOSITORY") - - # For branch name, especially for pull requests + env_info = { + "env": "GitHub Actions", + "actor": os.getenv("GITHUB_ACTOR", None), + "sha": os.getenv("GITHUB_SHA", None), + "repo": os.getenv("GITHUB_REPOSITORY", None), + } + + branch_ref = os.getenv("GITHUB_REF", "") if branch_ref.startswith("refs/pull/"): - pr_number = branch_ref.split("/")[2] - branch_name = f"PR-{pr_number}" - else: - branch_name = branch_ref.replace("refs/heads/", "") - return "GitHub Actions" - - # Jenkins - if os.getenv("JENKINS_URL"): - return "Jenkins" - - # Bitbucket Pipelines - if os.getenv("BITBUCKET_COMMIT"): - return "Bitbucket Pipelines" - - # AppVeyor - if os.getenv("APPVEYOR") == "True": - return "AppVeyor" + return None - # Azure Pipelines - if os.getenv("AZURE_PIPELINES"): - return "Azure Pipelines" + env_info["branch"] = ( + branch_ref.replace("refs/heads/", "") if branch_ref else None + ) + return env_info - # Default to None if none of the CI variables are set return None From bcbe21edfdf43f871611d795b7f221ad07cf0b03 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Fri, 26 Jan 2024 23:35:42 -0800 Subject: [PATCH 52/74] Added deployment key --- .github/workflows/deepeval-results.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/deepeval-results.yml b/.github/workflows/deepeval-results.yml index 5f77643af..c05abf087 100644 --- a/.github/workflows/deepeval-results.yml +++ b/.github/workflows/deepeval-results.yml @@ -43,6 +43,11 @@ jobs: - name: Check if 'deepeval' script is available run: ls -l $(poetry env info --path)/bin/deepeval || echo "deepeval script not found" + - name: Run deepeval login + env: + CONFIDENT_API_KEY: ${{ secrets.CONFIDENT_API_KEY }} + run: poetry run deepeval login --confident-api-key "$CONFIDENT_API_KEY" + - name: Run deepeval tests and capture output run: poetry run deepeval test run tests/test_quickstart.py > output.txt 2>&1 From 7c3e546adb42809d3163abef3953a51867ee9cdd Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Fri, 26 Jan 2024 23:45:51 -0800 Subject: [PATCH 53/74] Debug --- deepeval/cli/main.py | 1 + deepeval/utils.py | 1 + 2 files changed, 2 insertions(+) diff --git a/deepeval/cli/main.py b/deepeval/cli/main.py index 78596596b..4744c722f 100644 --- a/deepeval/cli/main.py +++ b/deepeval/cli/main.py @@ -32,6 +32,7 @@ def login( ), ): # Use the confident_api_key if it is provided, otherwise proceed with existing logic + print(confident_api_key) if confident_api_key: api_key = confident_api_key else: diff --git a/deepeval/utils.py b/deepeval/utils.py index eafed5728..fa3d6a3ec 100644 --- a/deepeval/utils.py +++ b/deepeval/utils.py @@ -24,6 +24,7 @@ def get_ci_env() -> 
Optional[Dict]: } branch_ref = os.getenv("GITHUB_REF", "") + print(branch_ref, "@@@@@@@@@@@@@@@@@@@@@@") if branch_ref.startswith("refs/pull/"): return None From ebe574c4288630d3261f63f01a5b93eb4370751d Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sat, 27 Jan 2024 00:05:10 -0800 Subject: [PATCH 54/74] debug --- deepeval/cli/main.py | 1 - deepeval/cli/test.py | 2 ++ deepeval/plugins/plugin.py | 1 + deepeval/utils.py | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/deepeval/cli/main.py b/deepeval/cli/main.py index 4744c722f..78596596b 100644 --- a/deepeval/cli/main.py +++ b/deepeval/cli/main.py @@ -32,7 +32,6 @@ def login( ), ): # Use the confident_api_key if it is provided, otherwise proceed with existing logic - print(confident_api_key) if confident_api_key: api_key = confident_api_key else: diff --git a/deepeval/cli/test.py b/deepeval/cli/test.py index 34964c76d..2ec815f39 100644 --- a/deepeval/cli/test.py +++ b/deepeval/cli/test.py @@ -58,7 +58,9 @@ def run( pytest_args.insert(0, "-x") ci_env = get_ci_env() + print(ci_env, "@@@@@@@@@@@@@@@@@@@@@@") if ci_env is not None: + print(ci_env, "CI ENV SHOULD BE NONE") ci_env_json = json.dumps(ci_env) pytest_args.extend(["--deployment", ci_env_json]) diff --git a/deepeval/plugins/plugin.py b/deepeval/plugins/plugin.py index 48c729a63..179e69b8e 100644 --- a/deepeval/plugins/plugin.py +++ b/deepeval/plugins/plugin.py @@ -11,6 +11,7 @@ def pytest_sessionstart(session: pytest.Session): test_run_manager.save_to_disk = True try: deployment_configs = session.config.getoption("--deployment") + print(deployment_configs, "LAST CHECK @@@@@@@@@@") if deployment_configs is None: deployment = False else: diff --git a/deepeval/utils.py b/deepeval/utils.py index fa3d6a3ec..68594e3b4 100644 --- a/deepeval/utils.py +++ b/deepeval/utils.py @@ -26,6 +26,7 @@ def get_ci_env() -> Optional[Dict]: branch_ref = os.getenv("GITHUB_REF", "") print(branch_ref, "@@@@@@@@@@@@@@@@@@@@@@") if branch_ref.startswith("refs/pull/"): + print(branch_ref, "!!!!!!!!!!!!!!!!!!") return None env_info["branch"] = ( From 6550f51722c7e27804bc28cdc74169af7dfc1b49 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sat, 27 Jan 2024 00:31:25 -0800 Subject: [PATCH 55/74] debug --- deepeval/cli/test.py | 12 +++++------- deepeval/plugins/plugin.py | 12 ++++++++++-- deepeval/test_run/__init__.py | 7 ++++++- deepeval/test_run/test_run.py | 16 ++++++++++++++-- deepeval/utils.py | 9 +++++---- 5 files changed, 40 insertions(+), 16 deletions(-) diff --git a/deepeval/cli/test.py b/deepeval/cli/test.py index 2ec815f39..3e2e5edf1 100644 --- a/deepeval/cli/test.py +++ b/deepeval/cli/test.py @@ -5,7 +5,7 @@ from typing_extensions import Annotated from typing import Optional from deepeval.test_run import test_run_manager, TEMP_FILE_NAME -from deepeval.utils import delete_file_if_exists, get_ci_env +from deepeval.utils import delete_file_if_exists, get_deployment_configs from deepeval.test_run import invoke_test_run_end_hook from deepeval.telemetry import capture_evaluation_count @@ -57,12 +57,10 @@ def run( if exit_on_first_failure: pytest_args.insert(0, "-x") - ci_env = get_ci_env() - print(ci_env, "@@@@@@@@@@@@@@@@@@@@@@") - if ci_env is not None: - print(ci_env, "CI ENV SHOULD BE NONE") - ci_env_json = json.dumps(ci_env) - pytest_args.extend(["--deployment", ci_env_json]) + deployment_configs = get_deployment_configs() + if deployment_configs is not None: + deployment_configs_json = json.dumps(deployment_configs) + pytest_args.extend(["--deployment", deployment_configs_json]) 
pytest_args.extend( [ diff --git a/deepeval/plugins/plugin.py b/deepeval/plugins/plugin.py index 179e69b8e..c85cf6730 100644 --- a/deepeval/plugins/plugin.py +++ b/deepeval/plugins/plugin.py @@ -4,24 +4,32 @@ from rich import print from typing import Optional, Any from deepeval.constants import PYTEST_RUN_TEST_NAME -from deepeval.test_run import test_run_manager +from deepeval.test_run import test_run_manager, DeploymentConfigs def pytest_sessionstart(session: pytest.Session): test_run_manager.save_to_disk = True try: deployment_configs = session.config.getoption("--deployment") - print(deployment_configs, "LAST CHECK @@@@@@@@@@") + disable_request = False + if deployment_configs is None: deployment = False else: deployment = True deployment_configs = json.loads(deployment_configs) + disable_request = deployment_configs.pop("is_pull_request", False) + deployment_configs = DeploymentConfigs(**deployment_configs) + + print("@@@@@@@@@@@") + print(deployment_configs) + print(disable_request) test_run_manager.create_test_run( deployment=deployment, deployment_configs=deployment_configs, file_name=session.config.getoption("file_or_dir")[0], + disable_request=disable_request, ) except: test_run_manager.create_test_run() diff --git a/deepeval/test_run/__init__.py b/deepeval/test_run/__init__.py index 07c3ed0ce..96648fcdb 100644 --- a/deepeval/test_run/__init__.py +++ b/deepeval/test_run/__init__.py @@ -1,2 +1,7 @@ -from .test_run import TestRun, test_run_manager, TEMP_FILE_NAME +from .test_run import ( + TestRun, + test_run_manager, + TEMP_FILE_NAME, + DeploymentConfigs, +) from .hooks import on_test_run_end, invoke_test_run_end_hook diff --git a/deepeval/test_run/test_run.py b/deepeval/test_run/test_run.py index a3078d237..98b1a111c 100644 --- a/deepeval/test_run/test_run.py +++ b/deepeval/test_run/test_run.py @@ -36,6 +36,14 @@ def from_metric(cls, metric: BaseMetric): return cls(metric=metric.__name__, score=metric.score) +class DeploymentConfigs(BaseModel): + env: str + actor: Optional[str] + branch: Optional[str] + sha: Optional[str] + repo: Optional[str] + + class MetricsAverageDict: def __init__(self): self.metric_dict = {} @@ -150,11 +158,13 @@ def __init__(self): self.test_run = None self.temp_file_name = TEMP_FILE_NAME self.save_to_disk = False + self.disable_request = False def reset(self): self.test_run = None self.temp_file_name = TEMP_FILE_NAME self.save_to_disk = False + self.disable_request = False def set_test_run(self, test_run: TestRun): self.test_run = test_run @@ -162,9 +172,11 @@ def set_test_run(self, test_run: TestRun): def create_test_run( self, deployment: Optional[bool] = False, - deployment_configs: Optional[Dict] = False, + deployment_configs: Optional[DeploymentConfigs] = False, file_name: Optional[str] = None, + disable_request: Optional[bool] = False, ): + self.disable_request = disable_request test_run = TestRun( testFile=file_name, testCases=[], @@ -272,7 +284,7 @@ def post_test_run(self, test_run: TestRun): for test_case in test_run.test_cases: test_case.id = None - if is_confident(): + if is_confident() and self.disable_request is False: try: body = test_run.model_dump(by_alias=True, exclude_none=True) except AttributeError: diff --git a/deepeval/utils.py b/deepeval/utils.py index 68594e3b4..25e8ee8d6 100644 --- a/deepeval/utils.py +++ b/deepeval/utils.py @@ -14,7 +14,7 @@ from deepeval.key_handler import KeyValues, KEY_FILE_HANDLER -def get_ci_env() -> Optional[Dict]: +def get_deployment_configs() -> Optional[Dict]: if os.getenv("GITHUB_ACTIONS") == "true": 
env_info = { "env": "GitHub Actions", @@ -24,11 +24,12 @@ def get_ci_env() -> Optional[Dict]: } branch_ref = os.getenv("GITHUB_REF", "") - print(branch_ref, "@@@@@@@@@@@@@@@@@@@@@@") if branch_ref.startswith("refs/pull/"): - print(branch_ref, "!!!!!!!!!!!!!!!!!!") - return None + is_pull_request = True + else: + is_pull_request = False + env_info["is_pull_request"] = is_pull_request env_info["branch"] = ( branch_ref.replace("refs/heads/", "") if branch_ref else None ) From ffa3ffc3675d5c95e7c83bb775712bfa030d3c2b Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sat, 27 Jan 2024 00:36:58 -0800 Subject: [PATCH 56/74] . --- deepeval/test_run/test_run.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/deepeval/test_run/test_run.py b/deepeval/test_run/test_run.py index 98b1a111c..82e715549 100644 --- a/deepeval/test_run/test_run.py +++ b/deepeval/test_run/test_run.py @@ -72,9 +72,10 @@ class TestRun(BaseModel): None, alias="testFile", ) - # TODO: change to Optional[str] deployment: Optional[bool] = Field(True) - deployment_configs: Optional[Dict] = Field(None, alias="deploymentConfigs") + deployment_configs: Optional[DeploymentConfigs] = Field( + None, alias="deploymentConfigs" + ) dict_test_cases: Dict[int, APITestCase] = Field( default_factory=dict, ) From 9ca77f6b9ecf785bed3a75f7db6a48be032f8ddd Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sat, 27 Jan 2024 00:40:08 -0800 Subject: [PATCH 57/74] . --- deepeval/plugins/plugin.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/deepeval/plugins/plugin.py b/deepeval/plugins/plugin.py index c85cf6730..968e7aa97 100644 --- a/deepeval/plugins/plugin.py +++ b/deepeval/plugins/plugin.py @@ -21,10 +21,6 @@ def pytest_sessionstart(session: pytest.Session): disable_request = deployment_configs.pop("is_pull_request", False) deployment_configs = DeploymentConfigs(**deployment_configs) - print("@@@@@@@@@@@") - print(deployment_configs) - print(disable_request) - test_run_manager.create_test_run( deployment=deployment, deployment_configs=deployment_configs, From c8942cb880f484a9b95aa2938c54292865c03545 Mon Sep 17 00:00:00 2001 From: Pratyush-exe Date: Sun, 28 Jan 2024 12:04:44 +0530 Subject: [PATCH 58/74] Added basic docs --- docs/docs/integrations-huggingface.mdx | 113 +++++++++++++++++++++++++ docs/sidebars.js | 39 ++++----- 2 files changed, 133 insertions(+), 19 deletions(-) create mode 100644 docs/docs/integrations-huggingface.mdx diff --git a/docs/docs/integrations-huggingface.mdx b/docs/docs/integrations-huggingface.mdx new file mode 100644 index 000000000..3920c3550 --- /dev/null +++ b/docs/docs/integrations-huggingface.mdx @@ -0,0 +1,113 @@ +--- +# id: integrations-hugginface +title: DeepEvalCallback +sidebar_label: DeepEvalCallback +--- + +## Quick Summary + +`DeepEvalHuggingFaceCallback` is a custom huggingface's `transformers.TrainerCallback` for in-depth evaluation of LLM/LM models during training/fine-tuning using `transformers.Trainer`. 
+ +## Usage + +### Importing the Necessary Components + +```python +from transformers import ( + Seq2SeqTrainer, + Seq2SeqTrainingArguments, + T5Tokenizer, + T5ForConditionalGeneration, + DataCollatorForSeq2Seq, +) + +from datasets import load_dataset + +from deepeval.integrations import DeepEvalHuggingFaceCallback +from deepeval.metrics import HallucinationMetric, AnswerRelevancyMetric +from deepeval.dataset import EvaluationDataset, Golden +``` + +### Initializing Metrics and Evaluation Dataset + +```python +# Define evaluation metrics +hallucination_metric = HallucinationMetric(threshold=0.3) +answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5) +metrics = [hallucination_metric, answer_relevancy_metric] + +# Define goldens and eval_dataset +goldens = [Golden(...), Golden(...), Golden(...)] +eval_dataset = EvaluationDataset(goldens=goldens) +``` + +### Initialize `transformers` Trainer and Tokenizer +```python + +# Load training Dataset +training_dataset = load_dataset('DATASET') + +# Initalize tokenizer and model +tokenizer = T5Tokenizer.from_pretrained("MODEL-ID") +model = T5ForConditionalGeneration.from_pretrained("MODEL-ID") + +tokenizer_args = {...} +``` +### Initalize `transformers.Trainer` +```python +# Define training args +training_args = Seq2SeqTrainingArguments( + output_dir="OUTPUT-DIR", + overwrite_output_dir=True, + num_train_epochs=50, + per_device_train_batch_size=8, +) + +# Create Trainer instance (Seq2SeqTrainer is a child of Trainer) +trainer = Seq2SeqTrainer( + model=model, + tokenizer=tokenizer, + args=training_args, + train_dataset=training_dataset, +) +``` + +### Initalize `DeepeEvalCallback` and begin Training +```python +callback = DeepEvalHuggingFaceCallback( + metrics=metrics, + evaluation_dataset=eval_dataset, + tokenizer_args=tokenizer_args, + trainer=trainer, + show_table=True, + show_table_every=1, +) + +# Add the callback to the Trainer +trainer.add_callback(callback) + +# Start model training +trainer.train() +``` + +## Reference + +### `DeepEvalHuggingFaceCallback` Class + +#### Attributes + +- **`show_table`**: Flag indicating whether to display a table with evaluation metric scores. +- **`show_table_every`**: Frequency of displaying the evaluation table. +- **`metrics`**: Evaluation metrics used during training. +- **`evaluation_dataset`**: Dataset for evaluation. +- **`tokenizer_args`**: Arguments for the tokenizer. +- **`aggregation_method`**: Method for aggregating metric scores for multiple Goldens. +- **`trainer`**: transformers.trainer instance. + +#### Methods + +- **`on_epoch_begin`**: Triggered at the beginning of each training epoch. +- **`on_epoch_end`**: Triggered at the end of each training epoch. +- **`on_log`**: Triggered after logging the last logs. +- **`on_train_end`**: Triggered at the end of model training. +- **`on_train_begin`**: Triggered at the beginning of model training. 
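+
+A minimal sketch of how `aggregation_method` combines the per-test-case scores collected for each metric (illustrative metric names and values; `"avg"` is the default):
+
+```python
+scores = {"hallucination": [0.2, 0.6, 0.4], "answer_relevancy": [0.9, 0.7, 0.8]}
+aggregation_functions = {"avg": lambda x: sum(x) / len(x), "max": max, "min": min}
+
+# With aggregation_method="avg"
+aggregated = {name: aggregation_functions["avg"](values) for name, values in scores.items()}
+# -> {"hallucination": 0.4, "answer_relevancy": 0.8} (up to floating-point rounding)
+```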
\ No newline at end of file diff --git a/docs/sidebars.js b/docs/sidebars.js index 8809f489c..fe8d0a6db 100644 --- a/docs/sidebars.js +++ b/docs/sidebars.js @@ -4,7 +4,7 @@ module.exports = { type: 'category', label: 'Getting Started', items: [ - 'getting-started', + 'getting-started', ], collapsed: false, }, @@ -19,23 +19,23 @@ module.exports = { type: 'category', label: 'Metrics', items: [ - 'metrics-introduction', - 'metrics-llm-evals', - 'metrics-summarization', - 'metrics-answer-relevancy', - 'metrics-faithfulness', - 'metrics-contextual-precision', - 'metrics-contextual-relevancy', - 'metrics-contextual-recall', - 'metrics-ragas', - 'metrics-latency', - 'metrics-cost', - 'metrics-hallucination', - 'metrics-bias', - 'metrics-toxicity', - 'metrics-judgemental', - 'metrics-custom', - 'metrics-others', + 'metrics-introduction', + 'metrics-llm-evals', + 'metrics-summarization', + 'metrics-answer-relevancy', + 'metrics-faithfulness', + 'metrics-contextual-precision', + 'metrics-contextual-relevancy', + 'metrics-contextual-recall', + 'metrics-ragas', + 'metrics-latency', + 'metrics-cost', + 'metrics-hallucination', + 'metrics-bias', + 'metrics-toxicity', + 'metrics-judgemental', + 'metrics-custom', + 'metrics-others', ], collapsed: false, }, @@ -60,7 +60,8 @@ module.exports = { label: 'Integrations', items: [ 'integrations-introduction', - 'integrations-llamaindex' + 'integrations-llamaindex', + 'integrations-huggingface' ], collapsed: false, }, From 9e463bb0911b9945892d46ea0963afc005541766 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sun, 28 Jan 2024 23:15:44 -0800 Subject: [PATCH 59/74] Added custom model --- deepeval/metrics/answer_relevancy.py | 15 +++++---- deepeval/metrics/contextual_precision.py | 13 +++++--- deepeval/metrics/contextual_recall.py | 13 +++++--- deepeval/metrics/contextual_relevancy.py | 13 +++++--- deepeval/metrics/faithfulness.py | 15 +++++---- deepeval/metrics/g_eval.py | 13 +++++--- deepeval/metrics/ragas.py | 2 +- deepeval/metrics/summarization.py | 13 +++++--- deepeval/models/gpt_model.py | 9 +++--- docs/docs/integrations-huggingface.mdx | 16 ++++++---- docs/docs/metrics-introduction.mdx | 40 ++++++++++++++++++++++-- tests/test_rag_metrics.py | 2 +- 12 files changed, 112 insertions(+), 52 deletions(-) diff --git a/deepeval/metrics/answer_relevancy.py b/deepeval/metrics/answer_relevancy.py index 679ab81ca..1b89ac87f 100644 --- a/deepeval/metrics/answer_relevancy.py +++ b/deepeval/metrics/answer_relevancy.py @@ -6,7 +6,7 @@ from deepeval.utils import trimToJson from deepeval.test_case import LLMTestCase from deepeval.metrics import BaseMetric -from deepeval.models import GPTModel +from deepeval.models import GPTModel, DeepEvalBaseModel from deepeval.templates import AnswerRelevancyTemplate from deepeval.progress_context import metrics_progress_context @@ -20,11 +20,14 @@ class AnswerRelevancyMetric(BaseMetric): def __init__( self, threshold: float = 0.5, - model: Optional[Union[str, BaseChatModel]] = None, + model: Optional[Union[str, DeepEvalBaseModel, BaseChatModel]] = None, include_reason: bool = True, ): self.threshold = threshold - self.model = GPTModel(model=model) + if isinstance(model, DeepEvalBaseModel): + self.model = model + else: + self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason self.n = 5 @@ -85,7 +88,7 @@ def _generate_reason( ) res = self.model(prompt) - return res.content + return res def _generate_verdicts( self, original_question: str @@ -95,7 +98,7 @@ def 
_generate_verdicts( ) res = self.model(prompt) - json_output = trimToJson(res.content) + json_output = trimToJson(res) data = json.loads(json_output) verdicts = [AnswerRelvancyVerdict(**item) for item in data["verdicts"]] @@ -115,7 +118,7 @@ def _generate_key_points( ) res = self.model(prompt) - json_output = trimToJson(res.content) + json_output = trimToJson(res) data = json.loads(json_output) return data["key_points"] diff --git a/deepeval/metrics/contextual_precision.py b/deepeval/metrics/contextual_precision.py index a548b9fbb..f3b22e406 100644 --- a/deepeval/metrics/contextual_precision.py +++ b/deepeval/metrics/contextual_precision.py @@ -6,7 +6,7 @@ from deepeval.utils import trimToJson from deepeval.test_case import LLMTestCase from deepeval.metrics import BaseMetric -from deepeval.models import GPTModel +from deepeval.models import GPTModel, DeepEvalBaseModel from deepeval.templates import ContextualPrecisionTemplate from deepeval.progress_context import metrics_progress_context @@ -21,12 +21,15 @@ class ContextualPrecisionMetric(BaseMetric): def __init__( self, threshold: float = 0.5, - model: Optional[Union[str, BaseChatModel]] = None, + model: Optional[Union[str, DeepEvalBaseModel, BaseChatModel]] = None, include_reason: bool = True, ): self.threshold = threshold self.include_reason = include_reason - self.model = GPTModel(model=model) + if isinstance(model, DeepEvalBaseModel): + self.model = model + else: + self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() def measure(self, test_case: LLMTestCase) -> float: @@ -82,7 +85,7 @@ def _generate_reason(self, input: str, score: float): ) res = self.model(prompt) - return res.content + return res def _generate_score(self): # Convert verdicts to a binary list where 'yes' is 1 and others are 0 @@ -122,7 +125,7 @@ def _generate_verdicts( ) res = self.model(prompt) - json_output = trimToJson(res.content) + json_output = trimToJson(res) data = json.loads(json_output) verdicts = [ ContextualPrecisionVerdict(**item) for item in data["verdicts"] diff --git a/deepeval/metrics/contextual_recall.py b/deepeval/metrics/contextual_recall.py index 56485cc93..b4a2bd45f 100644 --- a/deepeval/metrics/contextual_recall.py +++ b/deepeval/metrics/contextual_recall.py @@ -6,7 +6,7 @@ from deepeval.utils import trimToJson from deepeval.test_case import LLMTestCase from deepeval.metrics import BaseMetric -from deepeval.models import GPTModel +from deepeval.models import GPTModel, DeepEvalBaseModel from deepeval.templates import ContextualRecallTemplate from deepeval.progress_context import metrics_progress_context @@ -20,11 +20,14 @@ class ContextualRecallMetric(BaseMetric): def __init__( self, threshold: float = 0.5, - model: Optional[Union[str, BaseChatModel]] = None, + model: Optional[Union[str, DeepEvalBaseModel, BaseChatModel]] = None, include_reason: bool = True, ): self.threshold = threshold - self.model = GPTModel(model=model) + if isinstance(model, DeepEvalBaseModel): + self.model = model + else: + self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason self.n = 5 @@ -76,7 +79,7 @@ def _generate_reason(self, expected_output: str, score: float): ) res = self.model(prompt) - return res.content + return res def _generate_score(self): if len(self.verdicts) == 0: @@ -96,7 +99,7 @@ def _generate_verdicts( expected_output=expected_output, retrieval_context=retrieval_context ) res = self.model(prompt) - json_output = trimToJson(res.content) + json_output 
= trimToJson(res) data = json.loads(json_output) verdicts = [ ContextualRecallVerdict(**item) for item in data["verdicts"] diff --git a/deepeval/metrics/contextual_relevancy.py b/deepeval/metrics/contextual_relevancy.py index 38badc1e7..1ef8ea496 100644 --- a/deepeval/metrics/contextual_relevancy.py +++ b/deepeval/metrics/contextual_relevancy.py @@ -7,7 +7,7 @@ from deepeval.utils import trimToJson from deepeval.test_case import LLMTestCase from deepeval.metrics import BaseMetric -from deepeval.models import GPTModel +from deepeval.models import GPTModel, DeepEvalBaseModel from deepeval.templates import ContextualRelevancyTemplate from deepeval.progress_context import metrics_progress_context @@ -21,11 +21,14 @@ class ContextualRelevancyMetric(BaseMetric): def __init__( self, threshold: float = 0.5, - model: Optional[Union[str, BaseChatModel]] = None, + model: Optional[Union[str, DeepEvalBaseModel, BaseChatModel]] = None, include_reason: bool = True, ): self.threshold = threshold - self.model = GPTModel(model=model) + if isinstance(model, DeepEvalBaseModel): + self.model = model + else: + self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason @@ -73,7 +76,7 @@ def _generate_reason(self, input: str, score: float): ) res = self.model(prompt) - return res.content + return res def _generate_score(self): irrelevant_sentences = 0 @@ -103,7 +106,7 @@ def _generate_verdicts( ) res = self.model(prompt) - json_output = trimToJson(res.content) + json_output = trimToJson(res) data = json.loads(json_output) verdicts = [ ContextualRelevancyVerdict(**item) for item in data["verdicts"] diff --git a/deepeval/metrics/faithfulness.py b/deepeval/metrics/faithfulness.py index d019b79b4..8d923b15e 100644 --- a/deepeval/metrics/faithfulness.py +++ b/deepeval/metrics/faithfulness.py @@ -7,7 +7,7 @@ from deepeval.test_case import LLMTestCase from deepeval.metrics import BaseMetric from deepeval.utils import trimToJson -from deepeval.models import GPTModel +from deepeval.models import GPTModel, DeepEvalBaseModel from deepeval.templates import FaithfulnessTemplate from deepeval.progress_context import metrics_progress_context @@ -22,11 +22,14 @@ class FaithfulnessMetric(BaseMetric): def __init__( self, threshold: float = 0.5, - model: Optional[Union[str, BaseChatModel]] = None, + model: Optional[Union[str, DeepEvalBaseModel, BaseChatModel]] = None, include_reason: bool = True, ): self.threshold = threshold - self.model = GPTModel(model=model) + if isinstance(model, DeepEvalBaseModel): + self.model = model + else: + self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason @@ -85,7 +88,7 @@ def _generate_reason(self, score: float): ) res = self.model(prompt) - return res.content + return res def _generate_truths( self, @@ -95,7 +98,7 @@ def _generate_truths( ): prompt = FaithfulnessTemplate.generate_truths(text=context) res = self.model(prompt) - json_output = trimToJson(res.content) + json_output = trimToJson(res) data = json.loads(json_output) truths = data["truths"] @@ -134,7 +137,7 @@ def _generate_verdicts( ) res = self.model(prompt) - json_output = trimToJson(res.content) + json_output = trimToJson(res) data = json.loads(json_output) verdicts = [FaithfulnessVerdict(**item) for item in data["verdicts"]] diff --git a/deepeval/metrics/g_eval.py b/deepeval/metrics/g_eval.py index 62081e36d..ce60c492d 100644 --- a/deepeval/metrics/g_eval.py +++ b/deepeval/metrics/g_eval.py @@ -9,7 
+9,7 @@ evaluation_results_template, ) from deepeval.utils import trimToJson -from deepeval.models import GPTModel +from deepeval.models import GPTModel, DeepEvalBaseModel from pydantic import BaseModel @@ -26,7 +26,7 @@ def __init__( evaluation_params: List[LLMTestCaseParams], criteria: Optional[str] = None, evaluation_steps: Optional[List[str]] = None, - model: Optional[Union[str, BaseChatModel]] = None, + model: Optional[Union[str, DeepEvalBaseModel, BaseChatModel]] = None, threshold: float = 0.5, ): self.name = name @@ -49,7 +49,10 @@ def __init__( ) self.criteria = criteria - self.model = GPTModel(model=model) + if isinstance(model, DeepEvalBaseModel): + self.model = model + else: + self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.evaluation_steps = evaluation_steps self.threshold = threshold @@ -87,7 +90,7 @@ def generate_evaluation_steps(self): res = self.model(prompt) - return res.content + return res def evaluate(self, test_case: LLMTestCase) -> Tuple[int, str]: text = """""" @@ -102,7 +105,7 @@ def evaluate(self, test_case: LLMTestCase) -> Tuple[int, str]: ) res = self.model(prompt) - json_output = trimToJson(res.content) + json_output = trimToJson(res) data = json.loads(json_output) return data["score"], data["reason"] diff --git a/deepeval/metrics/ragas.py b/deepeval/metrics/ragas.py index b5c26e7fc..154dbebcf 100644 --- a/deepeval/metrics/ragas.py +++ b/deepeval/metrics/ragas.py @@ -19,7 +19,7 @@ class RAGASContextualPrecisionMetric(BaseMetric): def __init__( self, threshold: float = 0.3, - model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo", + model: Optional[Union[str, BaseChatModel]] = None, ): self.threshold = threshold self.model = GPTModel(model=model) diff --git a/deepeval/metrics/summarization.py b/deepeval/metrics/summarization.py index 21195277a..b4b21f953 100644 --- a/deepeval/metrics/summarization.py +++ b/deepeval/metrics/summarization.py @@ -6,7 +6,7 @@ from deepeval.test_case import LLMTestCase from deepeval.metrics import BaseMetric -from deepeval.models import GPTModel +from deepeval.models import GPTModel, DeepEvalBaseModel from deepeval.utils import trimToJson from deepeval.templates import ( closed_end_questions_template, @@ -24,12 +24,15 @@ class SummarizationMetric(BaseMetric): def __init__( self, threshold: float = 0.5, - model: Optional[Union[str, BaseChatModel]] = None, + model: Optional[Union[str, DeepEvalBaseModel, BaseChatModel]] = None, n: Optional[int] = 5, assessment_questions: Optional[List[str]] = None, ): self.threshold = threshold - self.model = GPTModel(model=model) + if isinstance(model, DeepEvalBaseModel): + self.model = model + else: + self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.assessment_questions = assessment_questions self.n = n @@ -126,7 +129,7 @@ def generate_questions( ) res = self.model(prompt) - json_output = trimToJson(res.content) + json_output = trimToJson(res) data = json.loads(json_output) return data["questions"] @@ -137,7 +140,7 @@ def get_answer(self, question: str, text: str) -> str: ) res = self.model(prompt) - return res.content + return res def is_successful(self) -> bool: self.success = self.score >= self.threshold diff --git a/deepeval/models/gpt_model.py b/deepeval/models/gpt_model.py index b17f42c15..5ba2038de 100644 --- a/deepeval/models/gpt_model.py +++ b/deepeval/models/gpt_model.py @@ -1,10 +1,9 @@ -import os -from typing import Dict, Optional, Union +from typing import Optional, Union from langchain_openai 
import ChatOpenAI, AzureChatOpenAI from langchain_core.language_models import BaseChatModel from deepeval.key_handler import KeyValues, KEY_FILE_HANDLER -from deepeval.models.base import DeepEvalBaseModel +from deepeval.models import DeepEvalBaseModel from deepeval.chat_completion.retry import retry_with_exponential_backoff valid_gpt_models = [ @@ -81,9 +80,9 @@ def load_model(self): return ChatOpenAI(model_name=self.model_name) @retry_with_exponential_backoff - def _call(self, prompt: str): + def _call(self, prompt: str) -> str: chat_model = self.load_model() - return chat_model.invoke(prompt) + return chat_model.invoke(prompt).content def should_use_azure_openai(self): value = KEY_FILE_HANDLER.fetch_data(KeyValues.USE_AZURE_OPENAI) diff --git a/docs/docs/integrations-huggingface.mdx b/docs/docs/integrations-huggingface.mdx index 3920c3550..d5ee2198a 100644 --- a/docs/docs/integrations-huggingface.mdx +++ b/docs/docs/integrations-huggingface.mdx @@ -1,12 +1,12 @@ --- -# id: integrations-hugginface -title: DeepEvalCallback -sidebar_label: DeepEvalCallback +# id: integrations-huggingface +title: Hugging Face +sidebar_label: Hugging Face --- ## Quick Summary -`DeepEvalHuggingFaceCallback` is a custom huggingface's `transformers.TrainerCallback` for in-depth evaluation of LLM/LM models during training/fine-tuning using `transformers.Trainer`. +`DeepEvalHuggingFaceCallback` is a custom huggingface's `transformers.TrainerCallback` for in-depth evaluation of LLM/LM models during training/fine-tuning using `transformers.Trainer`. ## Usage @@ -42,6 +42,7 @@ eval_dataset = EvaluationDataset(goldens=goldens) ``` ### Initialize `transformers` Trainer and Tokenizer + ```python # Load training Dataset @@ -53,7 +54,9 @@ model = T5ForConditionalGeneration.from_pretrained("MODEL-ID") tokenizer_args = {...} ``` + ### Initalize `transformers.Trainer` + ```python # Define training args training_args = Seq2SeqTrainingArguments( @@ -73,6 +76,7 @@ trainer = Seq2SeqTrainer( ``` ### Initalize `DeepeEvalCallback` and begin Training + ```python callback = DeepEvalHuggingFaceCallback( metrics=metrics, @@ -102,7 +106,7 @@ trainer.train() - **`evaluation_dataset`**: Dataset for evaluation. - **`tokenizer_args`**: Arguments for the tokenizer. - **`aggregation_method`**: Method for aggregating metric scores for multiple Goldens. -- **`trainer`**: transformers.trainer instance. +- **`trainer`**: transformers.trainer instance. #### Methods @@ -110,4 +114,4 @@ trainer.train() - **`on_epoch_end`**: Triggered at the end of each training epoch. - **`on_log`**: Triggered after logging the last logs. - **`on_train_end`**: Triggered at the end of model training. -- **`on_train_begin`**: Triggered at the beginning of model training. \ No newline at end of file +- **`on_train_begin`**: Triggered at the beginning of model training. diff --git a/docs/docs/metrics-introduction.mdx b/docs/docs/metrics-introduction.mdx index 58e2618e4..8f0387628 100644 --- a/docs/docs/metrics-introduction.mdx +++ b/docs/docs/metrics-introduction.mdx @@ -79,19 +79,55 @@ deepeval unset-azure-openai We highly discourage the use of custom LLMs since evaluation requires a high level of reasoning capabilities that we find are generally not reachable apart from (Azure) OpenAI's GPT models. 
-But to use a custom LLM for evaluation, `deepeval` metrics currently supports all of langchain's [Chat Models](https://python.langchain.com/docs/integrations/chat/), which you can provide through the `model` argument when instantiating an LLM-based metric:
+That being said, `deepeval` allows you to use **ANY** custom LLM for evaluation. This includes LLMs accessed through langchain's [Chat Models](https://python.langchain.com/docs/integrations/chat/), or even LLMs in GGML format, which you can specify through the `model` argument when instantiating an LLM-based metric. Here is an example of using a custom Azure OpenAI model through langchain's `AzureChatOpenAI` interface for evaluation:
 
 ```python
 from langchain_openai import AzureChatOpenAI
+from deepeval.models.base import DeepEvalBaseModel
+
+class CustomEvaluationModel(DeepEvalBaseModel):
+    def __init__(self, model):
+        self.model = model
+
+    def load_model(self):
+        return self.model
+
+    def _call(self, prompt: str) -> str:
+        chat_model = self.load_model()
+        return chat_model.invoke(prompt).content
+
+    def get_model_name(self):
+        return "Custom Azure OpenAI Model"
+
+# Replace these with real values
 custom_azure_openai_model = AzureChatOpenAI(
     openai_api_version=openai_api_version,
     azure_deployment=azure_deployment,
     azure_endpoint=azure_endpoint,
     openai_api_key=openai_api_key,
 )
+custom_evaluation_model = CustomEvaluationModel(model=custom_azure_openai_model)
+print(custom_evaluation_model("Write me a joke"))
+```
+
+Remember, when creating a custom LLM evaluation model you should always:
+
+- inherit `DeepEvalBaseModel`.
+- implement the `load_model()` method, which will be responsible for returning a model object.
+- implement the `_call()` method with **one and only one** parameter of type string that acts as the prompt to your custom LLM.
+- the `_call()` method should return the final output string of your custom LLM. Note that we called `chat_model.invoke(prompt).content` in this particular example, but this could be different depending on the implementation of your custom LLM object.
+- the `get_model_name()` method simply returns a string representing the name of your LLM model.
+
+Note that the `model` argument in the `__init__()` method can accept any type (the model string or object itself). Lastly, to use it for evaluation in LLM-based metrics:
+
+```python
+from deepeval.metrics import AnswerRelevancyMetric
+...
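+# Usage sketch: any LLM-based metric that accepts a `model` argument can take the
+# `custom_evaluation_model` defined above in place of an OpenAI model string.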
-answer_relevancy_metric = AnswerRelevancyMetric(model=custom_azure_openai_model) +metric = AnswerRelevancyMetric(model=custom_evaluation_model) ``` :::note diff --git a/tests/test_rag_metrics.py b/tests/test_rag_metrics.py index 19a626388..b12969ab7 100644 --- a/tests/test_rag_metrics.py +++ b/tests/test_rag_metrics.py @@ -18,4 +18,4 @@ def test_answer_relevancy_again(): ) relevancy_metric = AnswerRelevancyMetric(threshold=0.5) faithfulness_metric = FaithfulnessMetric(threshold=0.5) - assert_test(test_case, [faithfulness_metric]) + assert_test(test_case, [faithfulness_metric, relevancy_metric]) From b28728329144e7b75559a33011f5a7a96ea4c44e Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sun, 28 Jan 2024 23:22:12 -0800 Subject: [PATCH 60/74] new release --- deepeval/_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deepeval/_version.py b/deepeval/_version.py index 1f97ae6e7..c495fd97f 100644 --- a/deepeval/_version.py +++ b/deepeval/_version.py @@ -1 +1 @@ -__version__: str = "0.20.53" +__version__: str = "0.20.54" diff --git a/pyproject.toml b/pyproject.toml index 9125de7f3..0c00f9ebb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "deepeval" -version = "0.20.53" +version = "0.20.54" description = "The Evaluation Framework for LLMs" authors = ["Jeffrey Ip "] license = "Apache-2.0" From 45d64f354b34f017705aa0bcaae48a2cd4758ce2 Mon Sep 17 00:00:00 2001 From: Yves Junqueira Date: Mon, 29 Jan 2024 11:44:42 +0000 Subject: [PATCH 61/74] Fix package setup --- deepeval/integrations/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 deepeval/integrations/__init__.py diff --git a/deepeval/integrations/__init__.py b/deepeval/integrations/__init__.py new file mode 100644 index 000000000..e69de29bb From 4c3076bd4deeadd4a8cf9d3d1f6cc86bdd12ecf3 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Mon, 29 Jan 2024 09:37:37 -0800 Subject: [PATCH 62/74] FIx evaluate --- deepeval/test_run/test_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepeval/test_run/test_run.py b/deepeval/test_run/test_run.py index 82e715549..c09cc8ba5 100644 --- a/deepeval/test_run/test_run.py +++ b/deepeval/test_run/test_run.py @@ -173,7 +173,7 @@ def set_test_run(self, test_run: TestRun): def create_test_run( self, deployment: Optional[bool] = False, - deployment_configs: Optional[DeploymentConfigs] = False, + deployment_configs: Optional[DeploymentConfigs] = None, file_name: Optional[str] = None, disable_request: Optional[bool] = False, ): From 995103a8a1df03dc8033c0f2c973b4f9e09bab1b Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Mon, 29 Jan 2024 09:38:50 -0800 Subject: [PATCH 63/74] new release --- deepeval/_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deepeval/_version.py b/deepeval/_version.py index c495fd97f..ee8034727 100644 --- a/deepeval/_version.py +++ b/deepeval/_version.py @@ -1 +1 @@ -__version__: str = "0.20.54" +__version__: str = "0.20.55" diff --git a/pyproject.toml b/pyproject.toml index 0c00f9ebb..7dc861b9e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "deepeval" -version = "0.20.54" +version = "0.20.55" description = "The Evaluation Framework for LLMs" authors = ["Jeffrey Ip "] license = "Apache-2.0" From 6566679329848b640a13443c72571c577f5fcc25 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Mon, 29 Jan 2024 16:12:49 -0800 Subject: [PATCH 64/74] Added confident cost 
latency logging --- deepeval/test_run/api.py | 2 ++ deepeval/test_run/test_run.py | 2 ++ tests/test_cost.py | 9 --------- tests/test_hallucination_metric.py | 6 ++++++ tests/{test_latency.py => test_performance.py} | 8 +++++++- 5 files changed, 17 insertions(+), 10 deletions(-) delete mode 100644 tests/test_cost.py rename tests/{test_latency.py => test_performance.py} (56%) diff --git a/deepeval/test_run/api.py b/deepeval/test_run/api.py index 4e45c9ef8..a01c40cbd 100644 --- a/deepeval/test_run/api.py +++ b/deepeval/test_run/api.py @@ -21,6 +21,8 @@ class APITestCase(BaseModel): ..., alias="metricsMetadata" ) run_duration: float = Field(..., alias="runDuration") + latency: Optional[float] = Field(None) + cost: Optional[float] = Field(None) traceStack: Optional[dict] = Field(None) context: Optional[list] = Field(None) retrieval_context: Optional[list] = Field(None, alias="retrievalContext") diff --git a/deepeval/test_run/test_run.py b/deepeval/test_run/test_run.py index c09cc8ba5..689578765 100644 --- a/deepeval/test_run/test_run.py +++ b/deepeval/test_run/test_run.py @@ -127,6 +127,8 @@ def add_llm_test_case( success=metric.is_successful(), metricsMetadata=[metric_metadata], runDuration=run_duration, + latency=test_case.latency, + cost=test_case.cost, context=test_case.context, retrievalContext=test_case.retrieval_context, traceStack=get_trace_stack(), diff --git a/tests/test_cost.py b/tests/test_cost.py deleted file mode 100644 index ca9b6ae8d..000000000 --- a/tests/test_cost.py +++ /dev/null @@ -1,9 +0,0 @@ -from deepeval.metrics import CostMetric -from deepeval.test_case import LLMTestCase -from deepeval import assert_test - - -def test_cost_metric(): - metric = CostMetric(threshold=12) - test_case = LLMTestCase(input="...", actual_output="...", cost=12) - assert_test(test_case, [metric]) diff --git a/tests/test_hallucination_metric.py b/tests/test_hallucination_metric.py index 13560d23c..f1d2df6a5 100644 --- a/tests/test_hallucination_metric.py +++ b/tests/test_hallucination_metric.py @@ -12,6 +12,8 @@ def test_hallucination_metric(): context=[ "A man with blond-hair, and a brown shirt drinking out of a public water fountain." 
], + cost=0.4, + latency=2, ) assert_test(test_case, [metric]) @@ -22,6 +24,8 @@ def test_hallucination_metric_2(): input="placeholder", actual_output="Python is a programming language.", context=["Python is NOT a programming language."], + cost=1, + latency=0.2, ) with pytest.raises(AssertionError): assert_test(test_case, [metric]) @@ -33,6 +37,8 @@ def test_hallucination_metric_3(): input="placeholder", actual_output="Python is a programming language.", context=["Python is a snake."], + cost=0.1, + latency=13.0, ) with pytest.raises(AssertionError): assert_test(test_case, [metric]) diff --git a/tests/test_latency.py b/tests/test_performance.py similarity index 56% rename from tests/test_latency.py rename to tests/test_performance.py index c8fc05dd5..43219bae4 100644 --- a/tests/test_latency.py +++ b/tests/test_performance.py @@ -1,8 +1,14 @@ -from deepeval.metrics import LatencyMetric +from deepeval.metrics import LatencyMetric, CostMetric from deepeval.test_case import LLMTestCase from deepeval import assert_test +def test_cost_metric(): + metric = CostMetric(threshold=12) + test_case = LLMTestCase(input="...", actual_output="...", cost=12) + assert_test(test_case, [metric]) + + def test_latency_metric(): metric = LatencyMetric(threshold=12) test_case = LLMTestCase( From ab08118e82c986478fb739b7e450b2fa61933151 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Mon, 29 Jan 2024 18:12:47 -0800 Subject: [PATCH 65/74] Added dataset alias for confidnet --- deepeval/dataset/dataset.py | 23 ++++++++++++++++++++--- deepeval/dataset/utils.py | 7 +++++-- deepeval/test_case.py | 2 ++ deepeval/test_run/test_run.py | 4 ++++ tests/test_dataset.py | 26 +++++++++++++++++++++----- 5 files changed, 52 insertions(+), 10 deletions(-) diff --git a/deepeval/dataset/dataset.py b/deepeval/dataset/dataset.py index 045992d2c..bc931ca8f 100644 --- a/deepeval/dataset/dataset.py +++ b/deepeval/dataset/dataset.py @@ -3,7 +3,6 @@ from rich.console import Console import json import webbrowser -import os from deepeval.metrics import BaseMetric from deepeval.test_case import LLMTestCase @@ -28,11 +27,18 @@ class EvaluationDataset: def __init__( self, + alias: Optional[str] = None, goldens: Optional[List[Golden]] = None, test_cases: Optional[List[LLMTestCase]] = None, ): - self.test_cases = test_cases or [] + if test_cases is not None: + for test_case in test_cases: + test_case.dataset_alias = alias + self.test_cases = test_cases + else: + self.test_cases = [] self.goldens = goldens or [] + self.alias = alias def add_test_case(self, test_case: LLMTestCase): self.test_cases.append(test_case) @@ -43,6 +49,11 @@ def __iter__(self): def evaluate(self, metrics: List[BaseMetric]): from deepeval import evaluate + if len(self.test_cases) == 0: + raise ValueError( + "No test cases found in evaluation dataset. Unable to evaluate empty dataset." 
+ ) + return evaluate(self.test_cases, metrics) def add_test_cases_from_csv_file( @@ -113,6 +124,7 @@ def get_column_data(df: pd.DataFrame, col_name: str, default=None): actual_output=actual_output, expected_output=expected_output, context=context, + dataset_alias=self.alias, ) ) @@ -175,6 +187,7 @@ def add_test_cases_from_json_file( actual_output=actual_output, expected_output=expected_output, context=context, + dataset_alias=self.alias, ) ) @@ -242,6 +255,7 @@ def add_test_cases_from_hf_dataset( actual_output=actual_output, expected_output=expected_output, context=context, + dataset_alias=self.alias, ) ) @@ -278,6 +292,7 @@ def push(self, alias: str): def pull(self, alias: str, auto_convert_goldens_to_test_cases: bool = True): if is_confident(): + self.alias = alias api = Api() result = api.get_request( endpoint=Endpoints.DATASET_ENDPOINT.value, @@ -289,7 +304,9 @@ def pull(self, alias: str, auto_convert_goldens_to_test_cases: bool = True): ) if auto_convert_goldens_to_test_cases: - self.test_cases = convert_goldens_to_test_cases(self.goldens) + self.test_cases = convert_goldens_to_test_cases( + response.goldens, alias + ) else: raise Exception( "Run `deepeval login` to pull dataset from Confident AI" diff --git a/deepeval/dataset/utils.py b/deepeval/dataset/utils.py index 7a9e039fc..48f27ce8e 100644 --- a/deepeval/dataset/utils.py +++ b/deepeval/dataset/utils.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Optional from deepeval.dataset.api import Golden from deepeval.test_case import LLMTestCase @@ -18,7 +18,9 @@ def convert_test_cases_to_goldens( return goldens -def convert_goldens_to_test_cases(goldens: List[Golden]) -> List[LLMTestCase]: +def convert_goldens_to_test_cases( + goldens: List[Golden], dataset_alias: Optional[str] = None +) -> List[LLMTestCase]: test_cases = [] for golden in goldens: test_case = LLMTestCase( @@ -27,6 +29,7 @@ def convert_goldens_to_test_cases(goldens: List[Golden]) -> List[LLMTestCase]: expected_output=golden.expected_output, context=golden.context, retrieval_context=golden.retrieval_context, + dataset_alias=dataset_alias, ) test_cases.append(test_case) return test_cases diff --git a/deepeval/test_case.py b/deepeval/test_case.py index 49b20954e..9c8ee7772 100644 --- a/deepeval/test_case.py +++ b/deepeval/test_case.py @@ -22,6 +22,7 @@ def __init__( retrieval_context: Optional[List[str]] = None, latency: Optional[float] = None, cost: Optional[float] = None, + dataset_alias: Optional[str] = None, id: Optional[str] = None, ): self.id = id @@ -32,3 +33,4 @@ def __init__( self.retrieval_context = retrieval_context self.latency = latency self.cost = cost + self.dataset_alias = dataset_alias diff --git a/deepeval/test_run/test_run.py b/deepeval/test_run/test_run.py index 689578765..190ebc794 100644 --- a/deepeval/test_run/test_run.py +++ b/deepeval/test_run/test_run.py @@ -72,6 +72,7 @@ class TestRun(BaseModel): None, alias="testFile", ) + dataset_alias: Optional[str] = Field(None, alias="datasetAlias") deployment: Optional[bool] = Field(True) deployment_configs: Optional[DeploymentConfigs] = Field( None, alias="deploymentConfigs" @@ -94,6 +95,9 @@ def add_llm_test_case( run_duration: float, index: int, ): + # Set database alias if exists on test case + self.dataset_alias = test_case.dataset_alias + # Check if test case with the same ID already exists test_case_id = id(test_case) existing_test_case: APITestCase = self.dict_test_cases.get( diff --git a/tests/test_dataset.py b/tests/test_dataset.py index d99ba8e98..d932c7bed 100644 --- 
a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -3,13 +3,12 @@ import pytest from deepeval.dataset import EvaluationDataset from deepeval.metrics import HallucinationMetric -from deepeval import assert_test +from deepeval import assert_test, evaluate from deepeval.test_case import LLMTestCase -dataset = EvaluationDataset() - def test_create_dataset(): + dataset = EvaluationDataset() module_b_dir = os.path.dirname(os.path.realpath(__file__)) file_path = os.path.join(module_b_dir, "data", "dataset.csv") @@ -32,10 +31,27 @@ def test_create_dataset(): ) assert len(dataset.test_cases) == 10, "Test Cases not loaded from JSON" - # dataset.push("alias") + +# test_case = LLMTestCase( +# input="What if these shoes don't fit?", +# # Replace this with the actual output from your LLM application +# actual_output="We offer a 30-day full refund at no extra costs.", +# context=["All customers are eligible for a 30 day full refund at no extra costs."] +# ) +# dataset = EvaluationDataset(alias="123", test_cases=[test_case]) + +# @pytest.mark.parametrize( +# "test_case", +# dataset, +# ) +# def test_test_dataset(test_case: LLMTestCase): +# metric = HallucinationMetric(threshold=0.5) +# assert_test(test_case, [metric]) -# dataset.pull("alias") +# dataset = EvaluationDataset() +# dataset.pull("Testa") +# print(dataset.test_cases) # @pytest.mark.parametrize( # "test_case", # dataset, From 4c64ed8d02c4780f04c132709d95d091aac1eb2f Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Mon, 29 Jan 2024 18:16:38 -0800 Subject: [PATCH 66/74] Fix convert goldens --- deepeval/integrations/hugging_face/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepeval/integrations/hugging_face/utils.py b/deepeval/integrations/hugging_face/utils.py index a0b6a9c5b..46eb84583 100644 --- a/deepeval/integrations/hugging_face/utils.py +++ b/deepeval/integrations/hugging_face/utils.py @@ -49,5 +49,5 @@ def generate_test_cases( decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True) golden.actual_output = decoded_output - test_cases = convert_goldens_to_test_cases(evaluation_dataset.goldens) + test_cases = convert_goldens_to_test_cases(goldens=evaluation_dataset.goldens, dataset_alias=evaluation_dataset.alias) return test_cases From 73e9eb35eedfecfc3411e57679f889eaa85f691f Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Mon, 29 Jan 2024 18:25:59 -0800 Subject: [PATCH 67/74] Fix cost and latency thresholds --- deepeval/integrations/hugging_face/utils.py | 5 ++++- deepeval/metrics/cost.py | 4 ++-- deepeval/metrics/latency.py | 4 ++-- docs/docs/evaluation-test-cases.mdx | 12 +++++++----- docs/docs/metrics-cost.mdx | 6 +++--- docs/docs/metrics-latency.mdx | 6 +++--- tests/test_performance.py | 4 ++-- 7 files changed, 23 insertions(+), 18 deletions(-) diff --git a/deepeval/integrations/hugging_face/utils.py b/deepeval/integrations/hugging_face/utils.py index 46eb84583..fde55fb44 100644 --- a/deepeval/integrations/hugging_face/utils.py +++ b/deepeval/integrations/hugging_face/utils.py @@ -49,5 +49,8 @@ def generate_test_cases( decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True) golden.actual_output = decoded_output - test_cases = convert_goldens_to_test_cases(goldens=evaluation_dataset.goldens, dataset_alias=evaluation_dataset.alias) + test_cases = convert_goldens_to_test_cases( + goldens=evaluation_dataset.goldens, + dataset_alias=evaluation_dataset.alias, + ) return test_cases diff --git a/deepeval/metrics/cost.py b/deepeval/metrics/cost.py index 642feef5c..8c2da03e5 100644 --- 
a/deepeval/metrics/cost.py +++ b/deepeval/metrics/cost.py @@ -3,8 +3,8 @@ class CostMetric(BaseMetric): - def __init__(self, threshold: float): - self.threshold = threshold + def __init__(self, max_cost: float): + self.threshold = max_cost def measure(self, test_case: LLMTestCase): self.success = test_case.cost <= self.threshold diff --git a/deepeval/metrics/latency.py b/deepeval/metrics/latency.py index a37017a05..6bb2e531e 100644 --- a/deepeval/metrics/latency.py +++ b/deepeval/metrics/latency.py @@ -3,8 +3,8 @@ class LatencyMetric(BaseMetric): - def __init__(self, threshold: float): - self.threshold = threshold + def __init__(self, max_latency: float): + self.threshold = max_latency def measure(self, test_case: LLMTestCase): self.success = test_case.latency <= self.threshold diff --git a/docs/docs/evaluation-test-cases.mdx b/docs/docs/evaluation-test-cases.mdx index c857962ac..6b2f8b10a 100644 --- a/docs/docs/evaluation-test-cases.mdx +++ b/docs/docs/evaluation-test-cases.mdx @@ -182,13 +182,14 @@ The `latency` is an **optional** parameter that represents how long it took your test_case = LLMTestCase( input="...", actual_output="...", - # Replace this with the actual latency of your LLM application + # Replace this with the actual latency it took your + # LLM (application) to generate the actual output latency=10.4 ) ``` -:::note -`deepeval` does not offer metrics that evaluate on latency and cost, so feel free to supply the `latency` in either seconds, miliseconds, or even nanoseconds. That being said, [here is a full working example](metrics-custom#implementation) of how you can build your own `LatencyMetric` using the `latency` parameter. +:::info +The only `deepeval` metric that uses the `latency` parameter is the [`LatencyMetric`.](metrics-latency) ::: ## Cost @@ -199,13 +200,14 @@ The `cost` is an **optional** parameter that represents the token cost for a giv test_case = LLMTestCase( input="...", actual_output="...", - # Replace this with the actual latency of your LLM application + # Replace this with the actual cost it took your + # LLM (application) to generate the actual output cost=0.78 ) ``` :::info -`deepeval` does not offer cost and latency metrics because it is difficult to account for all different units and currencies available. We highly encourage you to look at the [custom metrics section](metrics-custom#implementation) for a full example on how to create your own metric if you are looking to evaluate cost and latency. +Similar to the `LatencyMetric`, the [`CostMetric`](metrics-cost) is the only `deepeval` metric that uses the `cost` parameter. ::: ## Run A Test Case diff --git a/docs/docs/metrics-cost.mdx b/docs/docs/metrics-cost.mdx index 5923ebdf8..880f5ff86 100644 --- a/docs/docs/metrics-cost.mdx +++ b/docs/docs/metrics-cost.mdx @@ -21,7 +21,7 @@ from deepeval import evaluate from deepeval.metrics import CostMetric from deepeval.test_case import LLMTestCase -metric = CostMetric(threshold=0.4) +metric = CostMetric(max_cost=0.4) test_case = LLMTestCase( input="...", actual_output="...", @@ -29,10 +29,10 @@ test_case = LLMTestCase( ) metric.measure(test_case) -# True if cost <= threshold +# True if cost <= max_cost print(metric.is_successful()) ``` :::note -Similar to `LatencyMetric`, the `CostMetric` threshold does **NOT** have any standard units. However, you need to make sure the monetary units you provide in the `cost` argument when creating an `LLMTestCase` matches that of the cost `threshold`. 
+Similar to `LatencyMetric`, the `CostMetric` threshold, `max_cost`, does **NOT** have any standard units. However, you need to make sure the monetary units you provide in the `cost` argument when creating an `LLMTestCase` matches that of the cost `max_cost`. ::: diff --git a/docs/docs/metrics-latency.mdx b/docs/docs/metrics-latency.mdx index fcf3688f7..c3785719e 100644 --- a/docs/docs/metrics-latency.mdx +++ b/docs/docs/metrics-latency.mdx @@ -25,7 +25,7 @@ from deepeval import evaluate from deepeval.metrics import LatencyMetric from deepeval.test_case import LLMTestCase -metric = LatencyMetric(threshold=10.0) +metric = LatencyMetric(max_latency=10.0) test_case = LLMTestCase( input="...", actual_output="...", @@ -33,12 +33,12 @@ test_case = LLMTestCase( ) metric.measure(test_case) -# True if latency <= threshold +# True if latency <= max_latency print(metric.is_successful()) ``` :::note -It does not matter what unit of time you provide the `threshold` argument with, it only has to match the unit of `latency` when creating an `LLMTestCase`. +It does not matter what unit of time you provide the `max_latency` argument with, it only has to match the unit of `latency` when creating an `LLMTestCase`. ::: diff --git a/tests/test_performance.py b/tests/test_performance.py index 43219bae4..a3be64baa 100644 --- a/tests/test_performance.py +++ b/tests/test_performance.py @@ -4,13 +4,13 @@ def test_cost_metric(): - metric = CostMetric(threshold=12) + metric = CostMetric(max_cost=12) test_case = LLMTestCase(input="...", actual_output="...", cost=12) assert_test(test_case, [metric]) def test_latency_metric(): - metric = LatencyMetric(threshold=12) + metric = LatencyMetric(max_latency=12) test_case = LLMTestCase( input="...", actual_output="...", From 6c9d0567c45e20058473468b48ee4a5304d1e06a Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Mon, 29 Jan 2024 18:27:55 -0800 Subject: [PATCH 68/74] updated docs --- docs/docs/metrics-latency.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/metrics-latency.mdx b/docs/docs/metrics-latency.mdx index c3785719e..ccde3f3e6 100644 --- a/docs/docs/metrics-latency.mdx +++ b/docs/docs/metrics-latency.mdx @@ -7,7 +7,7 @@ sidebar_label: Latency The latency metric measures whether the completion time of your LLM (application) is efficient and meets the expected time limits. It is one of the two performance metrics offered by `deepeval`. :::info -Performance metrics in `deepeval` are metrics that evaluate aspects such as latency and cost, rather than the outputs of LLM (applications). +Performance metrics in `deepeval` are metrics that evaluate aspects such as latency and cost, rather than the outputs of your LLM (application). ::: ## Required Arguments From c7ccd87850a511d399d752bb1e5256dedeab1f71 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Mon, 29 Jan 2024 18:31:28 -0800 Subject: [PATCH 69/74] . 
--- deepeval/_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deepeval/_version.py b/deepeval/_version.py index ee8034727..c8a7aba1a 100644 --- a/deepeval/_version.py +++ b/deepeval/_version.py @@ -1 +1 @@ -__version__: str = "0.20.55" +__version__: str = "0.20.56" diff --git a/pyproject.toml b/pyproject.toml index 7dc861b9e..f3ccfdbf8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "deepeval" -version = "0.20.55" +version = "0.20.56" description = "The Evaluation Framework for LLMs" authors = ["Jeffrey Ip "] license = "Apache-2.0" From bead5c027fffbd31e83b4a4e60b85fe47aa6bdc1 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Mon, 29 Jan 2024 20:13:09 -0800 Subject: [PATCH 70/74] Added docs --- docs/docs/confident-ai-github-actions.mdx | 46 +++++++++++++++++++++++ docs/sidebars.js | 1 + 2 files changed, 47 insertions(+) create mode 100644 docs/docs/confident-ai-github-actions.mdx diff --git a/docs/docs/confident-ai-github-actions.mdx b/docs/docs/confident-ai-github-actions.mdx new file mode 100644 index 000000000..6b347e6c7 --- /dev/null +++ b/docs/docs/confident-ai-github-actions.mdx @@ -0,0 +1,46 @@ +--- +id: confident-ai-github-actions +title: Evals in GitHub Actions +sidebar_label: Evals in GitHub Actions +--- + +## Quick Summary + +Confident AI allows you to monitor evaluation results in CI/CD pipelines using GitHub Actions, specifically on pushes to the repository. To set this up, simply execute `deepeval test run` within your workflow defined in a YAML file located in the `.github/workflows/` directory of your GitHub repository. + +:::info +Confident is currently integrated with GitHub Actions. +::: + +## Setup Evals for GitHub Actions + +`deepeval` tracks evaluations ran in GitHub Actions for push events only. To begin, define an evaluation dataset/test cases in a test file and execute it via `deepeval test run` in a GitHub workflow YAML file: + +```yaml title=".github/workflows/llm-evaluations.yml" +name: LLM Deployment Evaluations + +# Make sure to include push events +on: + push: + +jobs: + test: + runs-on: ubuntu-latest + steps: + # Some extra steps to setup and install dependencies + ... + + - name: Login to Confident + env: + CONFIDENT_API_KEY: ${{ secrets.CONFIDENT_API_KEY }} + run: poetry run deepeval login --confident-api-key "$CONFIDENT_API_KEY" + + - name: Run deepeval tests + run: poetry run deepeval test run test_file.py +``` + +:::note +Your workflow file does **NOT** have to be same as the example shown above. In the example, we used poetry and GitHub secrets to store and access our API key, which is not a strict requirement. +::: + +**Congratulations!** With this setup, `deepeval` will automatically log evaluation results will be automatically logged to your project's deployments page on Confident AI for each push event. 
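For reference, `deepeval test run test_file.py` in the workflow above expects a pytest-style test file that asserts metrics against test cases. The sketch below is illustrative only and is not part of this patch — the file name, dataset alias, metric choice, and threshold are assumptions, reusing only APIs that appear elsewhere in this series (`EvaluationDataset(alias=..., test_cases=...)`, `assert_test`, `HallucinationMetric`).

```python
# test_file.py — hypothetical example; names and values are placeholders.
import pytest

from deepeval import assert_test
from deepeval.dataset import EvaluationDataset
from deepeval.metrics import HallucinationMetric
from deepeval.test_case import LLMTestCase

# Build a small dataset; the alias (assumed here) links results to a dataset on Confident AI.
dataset = EvaluationDataset(
    alias="refund-policy-faq",
    test_cases=[
        LLMTestCase(
            input="What if these shoes don't fit?",
            actual_output="We offer a 30-day full refund at no extra costs.",
            context=[
                "All customers are eligible for a 30 day full refund at no extra costs."
            ],
        )
    ],
)


# deepeval discovers pytest-style tests; each failing assertion fails the CI job.
@pytest.mark.parametrize("test_case", dataset)
def test_llm_outputs(test_case: LLMTestCase):
    assert_test(test_case, [HallucinationMetric(threshold=0.5)])
```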
diff --git a/docs/sidebars.js b/docs/sidebars.js index fe8d0a6db..6b2f2b369 100644 --- a/docs/sidebars.js +++ b/docs/sidebars.js @@ -51,6 +51,7 @@ module.exports = { 'confident-ai-evaluate-datasets', 'confident-ai-analyze-evaluations', 'confident-ai-debug-evaluations', + 'confident-ai-github-actions', 'confident-ai-evals-in-production' ], collapsed: false, From d3939c90d53891ec089c7a4a89cd3213836f7bb4 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Wed, 31 Jan 2024 15:24:57 -0800 Subject: [PATCH 71/74] Updated docs --- docs/docs/confident-ai-analyze-evaluations.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/confident-ai-analyze-evaluations.mdx b/docs/docs/confident-ai-analyze-evaluations.mdx index f8f2f6af1..bbeb32cc5 100644 --- a/docs/docs/confident-ai-analyze-evaluations.mdx +++ b/docs/docs/confident-ai-analyze-evaluations.mdx @@ -13,7 +13,7 @@ Confident AI keeps track of your evaluation histories in both development and de ## Visualize Evaluation Results -Once logged in via `deepeval login`, all evaluations executed using `deepeval test run`, `evaluate(dataset, metrics)`, or `dataset.evaluate(metrics)`, will automatically have their results available on Confident. +Once logged in via `deepeval login`, all evaluations executed using `deepeval test run`, `evaluate(...)`, or `dataset.evaluate(...)`, will automatically have their results available on Confident. ![ok](https://d2lsxfc3p6r9rv.cloudfront.net/confident-test-cases.png) From 8c99c56a225a3dc1888b13514c8039a4af5017cc Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Wed, 31 Jan 2024 17:59:37 -0800 Subject: [PATCH 72/74] udpated docs --- docs/docs/confident-ai-github-actions.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/confident-ai-github-actions.mdx b/docs/docs/confident-ai-github-actions.mdx index 6b347e6c7..9870204ee 100644 --- a/docs/docs/confident-ai-github-actions.mdx +++ b/docs/docs/confident-ai-github-actions.mdx @@ -43,4 +43,4 @@ jobs: Your workflow file does **NOT** have to be same as the example shown above. In the example, we used poetry and GitHub secrets to store and access our API key, which is not a strict requirement. ::: -**Congratulations!** With this setup, `deepeval` will automatically log evaluation results will be automatically logged to your project's deployments page on Confident AI for each push event. +**Congratulations!** With this setup, `deepeval` will automatically log evaluation results to your project's deployments page on Confident AI. From 49b165b5c4a4fb49344d7571afda172ab01d2bc6 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Fri, 2 Feb 2024 16:09:40 -0800 Subject: [PATCH 73/74] updated docs --- docs/docs/metrics-ragas.mdx | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/docs/metrics-ragas.mdx b/docs/docs/metrics-ragas.mdx index 9b0537ec5..808b0125a 100644 --- a/docs/docs/metrics-ragas.mdx +++ b/docs/docs/metrics-ragas.mdx @@ -56,3 +56,16 @@ There are three optional parameters when creating a `RagasMetric`: - [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to 0.5. - [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** any one of langchain's [chat models](https://python.langchain.com/docs/integrations/chat/) of type `BaseChatModel`. Defaulted to 'gpt-3.5-turbo'. 
+ +:::note +You can also choose to import and evaluate using each metric individually: + +```python +from deepeval.metrics.ragas import RAGASAnswerRelevancyMetric +from deepeval.metrics.ragas import RAGASFaithfulnessMetric +from deepeval.metrics.ragas import RAGASContextualRecallMetric +from deepeval.metrics.ragas import RAGASContextualPrecisionMetric +``` + +These metrics accept the same arguments as the `RagasMetric`. +::: From 0728b191897e2ed172a121d3632dec834930969d Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sat, 3 Feb 2024 03:27:58 -0800 Subject: [PATCH 74/74] Fixed docs --- docs/docs/evaluation-test-cases.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/evaluation-test-cases.mdx b/docs/docs/evaluation-test-cases.mdx index 6b2f8b10a..d12b280f9 100644 --- a/docs/docs/evaluation-test-cases.mdx +++ b/docs/docs/evaluation-test-cases.mdx @@ -347,4 +347,4 @@ metric = HallucinationMetric(threshold=0.7) evaluate(test_cases, [metric]) ``` -Similar to `assert_test`, `evaluate` allows you to log and view test results on Confident AI. For more examples of `evalute`, visit the [datasets section](evaluation-datasets). +Similar to `assert_test`, `evaluate` allows you to log and view test results on Confident AI. For more examples of `evaluate`, visit the [datasets section](evaluation-datasets).
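To make the newly documented standalone RAGAS metrics concrete, here is a hedged usage sketch — it is not part of any patch above; the metric arguments simply mirror the documented `RagasMetric` options (`threshold`, `model`), and the test case values are placeholders (the exact required `LLMTestCase` fields follow the `RagasMetric` documentation).

```python
# Hypothetical example of one standalone RAGAS metric; values are placeholders.
from deepeval import evaluate
from deepeval.metrics.ragas import RAGASAnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# Same optional arguments as RagasMetric: a passing threshold and an evaluation model.
metric = RAGASAnswerRelevancyMetric(threshold=0.5, model="gpt-3.5-turbo")

test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra costs.",
    expected_output="You are eligible for a 30 day full refund at no extra cost.",
    retrieval_context=[
        "All customers are eligible for a 30 day full refund at no extra costs."
    ],
)

# Runs the metric and logs results (to Confident AI when logged in).
evaluate([test_case], [metric])
```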