Skip to content

Commit

Permalink
Merge pull request #417 from confident-ai/hotfix/separate-ragas
Browse files Browse the repository at this point in the history
Separated Ragas
  • Loading branch information
penguine-ip authored Jan 19, 2024
2 parents d6d6b81 + 02904d5 commit bbf8726
Show file tree
Hide file tree
Showing 13 changed files with 163 additions and 146 deletions.
4 changes: 2 additions & 2 deletions deepeval/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,8 @@ def print_test_result(test_result: TestResult):
print(
f" - ✅ {metric.__name__} (score: {metric.score}, threshold: {metric.threshold}, evaluation model: {metric.evaluation_model}, reason: {metric.reason})"
)
if metric.score_metadata:
for metric_name, score in metric.score_metadata.items():
if metric.score_breakdown:
for metric_name, score in metric.score_breakdown.items():
print(f" - {metric_name} (score: {score})")

print("\nFor test case:\n")
Expand Down
27 changes: 14 additions & 13 deletions deepeval/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,20 @@
from .contextual_precision import ContextualPrecisionMetric
from .latency import LatencyMetric
from .cost import CostMetric
from .ragas_metric import (
RagasMetric,
RAGASAnswerRelevancyMetric,
RAGASFaithfulnessMetric,
RAGASContextualRecallMetric,
RAGASContextualRelevancyMetric,
RAGASContextualPrecisionMetric,
RAGASAnswerRelevancyMetric,
RAGASConcisenessMetric as ConcisenessMetric,
RAGASCorrectnessMetric as CorrectnessMetric,
RAGASCoherenceMetric as CoherenceMetric,
RAGASMaliciousnessMetric as MaliciousnessMetric,
)

# from .ragas_metric import (
# RagasMetric,
# RAGASAnswerRelevancyMetric,
# RAGASFaithfulnessMetric,
# RAGASContextualRecallMetric,
# RAGASContextualRelevancyMetric,
# RAGASContextualPrecisionMetric,
# RAGASAnswerRelevancyMetric,
# RAGASConcisenessMetric as ConcisenessMetric,
# RAGASCorrectnessMetric as CorrectnessMetric,
# RAGASCoherenceMetric as CoherenceMetric,
# RAGASMaliciousnessMetric as MaliciousnessMetric,
# )
from .unbias_metric import UnBiasedMetric
from .non_toxic_metric import NonToxicMetric
from .hallucination_metric import HallucinationMetric
2 changes: 1 addition & 1 deletion deepeval/metrics/base_metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

class BaseMetric:
score: float = 0
score_metadata: Dict = None
score_breakdown: Dict = None
reason: Optional[str] = None
evaluation_model: Optional[str] = None

Expand Down
96 changes: 47 additions & 49 deletions deepeval/metrics/ragas_metric.py → deepeval/metrics/ragas.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
):
self.threshold = threshold
self.model = model
self.model = GPTModel(model=model)
self.evaluation_model = self.model.get_model_name()

def measure(self, test_case: LLMTestCase):
# sends to server
Expand All @@ -41,7 +42,7 @@ def measure(self, test_case: LLMTestCase):
raise ModuleNotFoundError("Please install dataset")

# Set LLM model
chat_model = GPTModel(self.model).load_model()
chat_model = self.model.load_model()
context_precision.llm = LangchainLLM(llm=chat_model)

# Create a dataset from the test case
Expand Down Expand Up @@ -79,7 +80,8 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
):
self.threshold = threshold
self.model = model
self.model = GPTModel(model=model)
self.evaluation_model = self.model.get_model_name()

def measure(self, test_case: LLMTestCase):
# sends to server
Expand All @@ -98,7 +100,7 @@ def measure(self, test_case: LLMTestCase):
raise ModuleNotFoundError("Please install dataset")

# Set LLM model
chat_model = GPTModel(self.model).load_model()
chat_model = self.model.load_model()
context_relevancy.llm = LangchainLLM(llm=chat_model)

# Create a dataset from the test case
Expand Down Expand Up @@ -135,7 +137,8 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
):
self.threshold = threshold
self.model = model
self.model = GPTModel(model=model)
self.evaluation_model = self.model.get_model_name()

def measure(self, test_case: LLMTestCase):
# sends to server
Expand All @@ -154,7 +157,7 @@ def measure(self, test_case: LLMTestCase):
raise ModuleNotFoundError("Please install dataset")

# Set LLM model
chat_model = GPTModel(self.model).load_model()
chat_model = self.model.load_model()
answer_relevancy.llm = LangchainLLM(llm=chat_model)

data = {
Expand Down Expand Up @@ -185,7 +188,8 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
):
self.threshold = threshold
self.model = model
self.model = GPTModel(model=model)
self.evaluation_model = self.model.get_model_name()

def measure(self, test_case: LLMTestCase):
# sends to server
Expand All @@ -204,7 +208,7 @@ def measure(self, test_case: LLMTestCase):
raise ModuleNotFoundError("Please install dataset")

# Set LLM model
chat_model = GPTModel(self.model).load_model()
chat_model = self.model.load_model()
faithfulness.llm = LangchainLLM(llm=chat_model)

data = {
Expand Down Expand Up @@ -237,7 +241,8 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
):
self.threshold = threshold
self.model = model
self.model = GPTModel(model=model)
self.evaluation_model = self.model.get_model_name()

def measure(self, test_case: LLMTestCase):
# sends to server
Expand All @@ -256,7 +261,7 @@ def measure(self, test_case: LLMTestCase):
raise ModuleNotFoundError("Please install dataset")

# Set LLM model
chat_model = GPTModel(self.model).load_model()
chat_model = self.model.load_model()
context_recall.llm = LangchainLLM(llm=chat_model)

data = {
Expand All @@ -280,7 +285,7 @@ def __name__(self):
return format_ragas_metric_name("Contextual Recall")


class RAGASHarmfulnessMetric(BaseMetric):
class HarmfulnessMetric(BaseMetric):
"""This metric checks the harmfulness using Ragas"""

def __init__(
Expand All @@ -289,7 +294,8 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
):
self.threshold = threshold
self.model = model
self.model = GPTModel(model=model)
self.evaluation_model = self.model.get_model_name()

def measure(self, test_case: LLMTestCase):
# sends to server
Expand All @@ -308,7 +314,7 @@ def measure(self, test_case: LLMTestCase):
raise ModuleNotFoundError("Please install dataset")

# Set LLM model
chat_model = GPTModel(self.model).load_model()
chat_model = self.model.load_model()
harmfulness.llm = LangchainLLM(llm=chat_model)

data = {
Expand All @@ -333,7 +339,7 @@ def __name__(self):
return "Harmfulness"


class RAGASCoherenceMetric(BaseMetric):
class CoherenceMetric(BaseMetric):
"""This metric checks the coherence using Ragas"""

def __init__(
Expand All @@ -342,7 +348,8 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
):
self.threshold = threshold
self.model = model
self.model = GPTModel(model=model)
self.evaluation_model = self.model.get_model_name()

def measure(self, test_case: LLMTestCase):
try:
Expand All @@ -359,7 +366,7 @@ def measure(self, test_case: LLMTestCase):
raise ModuleNotFoundError("Please install dataset")

# Set LLM model
chat_model = GPTModel(self.model).load_model()
chat_model = self.model.load_model()
coherence.llm = LangchainLLM(llm=chat_model)

data = {
Expand All @@ -384,7 +391,7 @@ def __name__(self):
return "Coherence"


class RAGASMaliciousnessMetric(BaseMetric):
class MaliciousnessMetric(BaseMetric):
"""This metric checks the maliciousness using Ragas"""

def __init__(
Expand All @@ -393,7 +400,8 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
):
self.threshold = threshold
self.model = model
self.model = GPTModel(model=model)
self.evaluation_model = self.model.get_model_name()

def measure(self, test_case: LLMTestCase):
try:
Expand All @@ -411,7 +419,7 @@ def measure(self, test_case: LLMTestCase):
raise ModuleNotFoundError("Please install dataset")

# Set LLM model
chat_model = GPTModel(self.model).load_model()
chat_model = self.model.load_model()
maliciousness.llm = LangchainLLM(llm=chat_model)

data = {
Expand All @@ -436,7 +444,7 @@ def __name__(self):
return "Maliciousness"


class RAGASCorrectnessMetric(BaseMetric):
class CorrectnessMetric(BaseMetric):
"""This metric checks the correctness using Ragas"""

def __init__(
Expand All @@ -445,7 +453,8 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
):
self.threshold = threshold
self.model = model
self.model = GPTModel(model=model)
self.evaluation_model = self.model.get_model_name()

def measure(self, test_case: LLMTestCase):
try:
Expand All @@ -463,7 +472,7 @@ def measure(self, test_case: LLMTestCase):
raise ModuleNotFoundError("Please install dataset")

# Set LLM model
chat_model = GPTModel(self.model).load_model()
chat_model = self.model.load_model()
correctness.llm = LangchainLLM(llm=chat_model)

data = {
Expand All @@ -488,7 +497,7 @@ def __name__(self):
return "Correctness"


class RAGASConcisenessMetric(BaseMetric):
class ConcisenessMetric(BaseMetric):
"""This metric checks the conciseness using Ragas"""

def __init__(
Expand All @@ -497,7 +506,8 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
):
self.threshold = threshold
self.model = model
self.model = GPTModel(model=model)
self.evaluation_model = self.model.get_model_name()

def measure(self, test_case: LLMTestCase):
try:
Expand All @@ -514,7 +524,7 @@ def measure(self, test_case: LLMTestCase):
raise ModuleNotFoundError("Please install dataset")

# Set LLM model
chat_model = GPTModel(self.model).load_model()
chat_model = self.model.load_model()
conciseness.llm = LangchainLLM(llm=chat_model)

data = {
Expand Down Expand Up @@ -548,7 +558,9 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
):
self.threshold = threshold
self.model = model
self.model_name = model
self.model = GPTModel(model=model)
self.evaluation_model = self.model.get_model_name()

def measure(self, test_case: LLMTestCase):
# sends to server
Expand All @@ -567,37 +579,23 @@ def measure(self, test_case: LLMTestCase):

# Create a dataset from the test case
# Convert the LLMTestCase to a format compatible with Dataset
score_metadata = {}
score_breakdown = {}
metrics = [
RAGASContextualPrecisionMetric(model=self.model),
RAGASContextualRecallMetric(model=self.model),
RAGASFaithfulnessMetric(model=self.model),
RAGASAnswerRelevancyMetric(model=self.model),
RAGASContextualPrecisionMetric(model=self.model_name),
RAGASContextualRecallMetric(model=self.model_name),
RAGASFaithfulnessMetric(model=self.model_name),
RAGASAnswerRelevancyMetric(model=self.model_name),
]

warnings_list = []

for metric in metrics:
score = metric.measure(test_case)
score_metadata[metric.__name__] = score
if score == 0:
warnings_list.append(
f"The RAGAS score will be 0 since {metric.__name__} has a score of 0"
)

for warning in warnings_list:
print(warning)

if any(score == 0 for score in score_metadata.values()):
ragas_score = 0
else:
ragas_score = len(score_metadata) / sum(
1.0 / score for score in score_metadata.values()
)
score_breakdown[metric.__name__] = score

ragas_score = sum(score_breakdown.values()) / len(score_breakdown)

self.success = ragas_score >= self.threshold
self.score = ragas_score
self.score_metadata = score_metadata
self.score_breakdown = score_breakdown
return self.score

def is_successful(self):
Expand Down
2 changes: 1 addition & 1 deletion deepeval/metrics/summarization.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def measure(self, test_case: LLMTestCase):
summarization_score = min(alignment_score, inclusion_score)

self.success = summarization_score >= self.threshold
self.score_metadata = {
self.score_breakdown = {
"Alignment": alignment_score,
"Inclusion": inclusion_score,
}
Expand Down
2 changes: 1 addition & 1 deletion docs/docs/metrics-answer-relevancy.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -55,5 +55,5 @@ There are three optional parameters when creating an `AnswerRelevancyMetric`:
You can also choose to fall back to Ragas' answer relevancy metric (which has a similar implementation). This, however, is not capable of generating a reason.

```python
from deepeval.metrics import RAGASAnswerRelevancyMetric
from deepeval.metrics.ragas import RAGASAnswerRelevancyMetric
```
2 changes: 1 addition & 1 deletion docs/docs/metrics-contextual-precision.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -60,5 +60,5 @@ There are three optional parameters when creating a `ContextualPrecisionMetric`:
You can also choose to fall back to Ragas' contextual precision metric (which has a similar implementation). This, however, is not capable of generating a reason.

```python
from deepeval.metrics import RAGASContextualPrecisionMetric
from deepeval.metrics.ragas import RAGASContextualPrecisionMetric
```
2 changes: 1 addition & 1 deletion docs/docs/metrics-contextual-recall.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -60,5 +60,5 @@ There are three optional parameters when creating a `ContextualRecallMetric`:
You can also choose to fall back to Ragas' contextual recall metric (which has a similar implementation). This, however, is not capable of generating a reason.

```python
from deepeval.metrics import RAGASContextualRecallMetric
from deepeval.metrics.ragas import RAGASContextualRecallMetric
```
2 changes: 1 addition & 1 deletion docs/docs/metrics-contextual-relevancy.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -59,5 +59,5 @@ There are three optional parameters when creating a `ContextualRelevancyMetricMe
You can also choose to fall back to Ragas' contextual relevancy metric (which has a similar implementation). This, however, is not capable of generating a reason.

```python
from deepeval.metrics import RAGASContextualRelevancyMetric
from deepeval.metrics.ragas import RAGASContextualRelevancyMetric
```
2 changes: 1 addition & 1 deletion docs/docs/metrics-faithfulness.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -59,5 +59,5 @@ There are three optional parameters when creating a `FaithfulnessMetric`:
You can also choose to fall back to Ragas' faithfulness metric (which has a similar implementation). This, however, is not capable of generating a reason.

```python
from deepeval.metrics import RAGASFaithfulnessMetric
from deepeval.metrics.ragas import RAGASFaithfulnessMetric
```
Loading

0 comments on commit bbf8726

Please sign in to comment.