Added chat models to ragas
penguine-ip committed Jan 16, 2024
1 parent 6e31148 commit 637081f
Showing 9 changed files with 51 additions and 15 deletions.
27 changes: 14 additions & 13 deletions deepeval/metrics/ragas_metric.py
@@ -1,8 +1,9 @@
 """An implementation of the Ragas metric
 """
-from typing import Optional
-
+from typing import Optional, Union
 from ragas.llms import LangchainLLM
+from langchain_core.language_models import BaseChatModel
+
 from deepeval.metrics import BaseMetric
 from deepeval.test_case import LLMTestCase
 from deepeval.models import GPTModel
@@ -18,7 +19,7 @@ class RAGASContextualPrecisionMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.3,
-        model: Optional[str] = "gpt-3.5-turbo",
+        model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
     ):
         self.threshold = threshold
         self.model = model
@@ -75,7 +76,7 @@ class RAGASContextualRelevancyMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.3,
-        model: Optional[str] = "gpt-3.5-turbo",
+        model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
     ):
         self.threshold = threshold
         self.model = model
@@ -131,7 +132,7 @@ class RAGASAnswerRelevancyMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.3,
-        model: Optional[str] = "gpt-3.5-turbo",
+        model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
     ):
         self.threshold = threshold
         self.model = model
@@ -181,7 +182,7 @@ class RAGASFaithfulnessMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.3,
-        model: Optional[str] = "gpt-3.5-turbo",
+        model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
     ):
         self.threshold = threshold
         self.model = model
@@ -233,7 +234,7 @@ class RAGASContextualRecallMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.3,
-        model: Optional[str] = "gpt-3.5-turbo",
+        model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
     ):
         self.threshold = threshold
         self.model = model
@@ -285,7 +286,7 @@ class RAGASHarmfulnessMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.3,
-        model: Optional[str] = "gpt-3.5-turbo",
+        model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
     ):
         self.threshold = threshold
         self.model = model
@@ -338,7 +339,7 @@ class RAGASCoherenceMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.3,
-        model: Optional[str] = "gpt-3.5-turbo",
+        model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
     ):
         self.threshold = threshold
         self.model = model
@@ -389,7 +390,7 @@ class RAGASMaliciousnessMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.3,
-        model: Optional[str] = "gpt-3.5-turbo",
+        model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
     ):
         self.threshold = threshold
         self.model = model
@@ -441,7 +442,7 @@ class RAGASCorrectnessMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.3,
-        model: Optional[str] = "gpt-3.5-turbo",
+        model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
     ):
         self.threshold = threshold
         self.model = model
@@ -493,7 +494,7 @@ class RAGASConcisenessMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.3,
-        model: Optional[str] = "gpt-3.5-turbo",
+        model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
     ):
         self.threshold = threshold
         self.model = model
@@ -544,7 +545,7 @@ class RagasMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.3,
-        model: Optional[str] = "gpt-3.5-turbo",
+        model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
     ):
         self.threshold = threshold
         self.model = model
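
With this change, the `model` argument of each RAGAS metric accepts either an OpenAI model-name string or a langchain `BaseChatModel` instance. A minimal sketch of both call patterns (the `ChatOpenAI` import path is an illustrative assumption and may vary across langchain versions):

```python
from langchain.chat_models import ChatOpenAI  # any langchain BaseChatModel; import path may differ by version

from deepeval.metrics.ragas_metric import RAGASFaithfulnessMetric

# Both forms are now accepted for `model`:
metric_from_string = RAGASFaithfulnessMetric(threshold=0.3, model="gpt-3.5-turbo")
metric_from_chat_model = RAGASFaithfulnessMetric(
    threshold=0.3, model=ChatOpenAI(model="gpt-3.5-turbo")
)
```
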
6 changes: 6 additions & 0 deletions docs/docs/metrics-answer-relevancy.mdx
@@ -46,6 +46,12 @@ print(metric.reason)
evaluate([test_case], [metric])
```

There are three optional parameters when creating the `AnswerRelevancyMetric`:

- [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to 0.5.
- [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** any one of langchain's [chat models](https://python.langchain.com/docs/integrations/chat/) of type `BaseChatModel`. Defaulted to 'gpt-4-1106-preview'.
- [Optional] `include_reason`: a boolean which, when set to `True`, will include a reason for its evaluation score. Defaulted to `True`.
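
For example, a minimal sketch of supplying a langchain chat model in place of a model-name string (the `ChatAnthropic` import and its arguments are illustrative assumptions, not taken from these docs):

```python
from langchain.chat_models import ChatAnthropic  # any of langchain's BaseChatModel classes can be used here

from deepeval.metrics import AnswerRelevancyMetric

# A chat model instance is passed via `model`; a plain model-name string such as
# "gpt-4-1106-preview" would work just as well.
metric = AnswerRelevancyMetric(
    threshold=0.5,
    model=ChatAnthropic(model="claude-2", temperature=0),
    include_reason=True,
)
```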

You can also choose to fall back to Ragas' answer relevancy metric (which has a similar implementation). This, however, is not capable of generating a reason.

```python
6 changes: 6 additions & 0 deletions docs/docs/metrics-contextual-precision.mdx
@@ -46,6 +46,12 @@ print(metric.reason)
evaluate([test_case], [metric])
```

There are three optional parameters when creating the `ContextualPrecisionMetric`:

- [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to 0.5.
- [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** any one of langchain's [chat models](https://python.langchain.com/docs/integrations/chat/) of type `BaseChatModel`. Defaulted to 'gpt-4-1106-preview'.
- [Optional] `include_reason`: a boolean which, when set to `True`, will include a reason for its evaluation score. Defaulted to `True`.

You can also choose to fall back to Ragas' contextual precision metric (which has a similar implementation). This, however, is not capable of generating a reason.

```python
6 changes: 6 additions & 0 deletions docs/docs/metrics-contextual-recall.mdx
@@ -51,6 +51,12 @@ print(metric.reason)
evaluate([test_case], [metric])
```

There are three optional parameters when creating the `ContextualRecallMetric`:

- [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to 0.5.
- [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** any one of langchain's [chat models](https://python.langchain.com/docs/integrations/chat/) of type `BaseChatModel`. Defaulted to 'gpt-4-1106-preview'.
- [Optional] `include_reason`: a boolean which, when set to `True`, will include a reason for its evaluation score. Defaulted to `True`.

You can also choose to fall back to Ragas' contextual recall metric (which has a similar implementation). This, however, is not capable of generating a reason.

```python
6 changes: 6 additions & 0 deletions docs/docs/metrics-contextual-relevancy.mdx
@@ -50,6 +50,12 @@ print(metric.reason)
evaluate([test_case], [metric])
```

There are three optional parameters when creating the `ContextualRelevancyMetric`:

- [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to 0.5.
- [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** any one of langchain's [chat models](https://python.langchain.com/docs/integrations/chat/) of type `BaseChatModel`. Defaulted to 'gpt-4-1106-preview'.
- [Optional] `include_reason`: a boolean which, when set to `True`, will include a reason for its evaluation score. Defaulted to `True`.

You can also choose to fall back to Ragas' contextual relevancy metric (which has a similar implementation). This, however, is not capable of generating a reason.

```python
6 changes: 6 additions & 0 deletions docs/docs/metrics-faithfulness.mdx
@@ -50,6 +50,12 @@ print(metric.reason)
evaluate([test_case], [metric])
```

There are three optional parameters when creating the `FaithfulnessMetric`:

- [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to 0.5.
- [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** any one of langchain's [chat models](https://python.langchain.com/docs/integrations/chat/) of type `BaseChatModel`. Defaulted to 'gpt-4-1106-preview'.
- [Optional] `include_reason`: a boolean which, when set to `True`, will include a reason for its evaluation score. Defaulted to `True`.

You can also choose to fall back to Ragas' faithfulness metric (which has a similar implementation). This, however, is not capable of generating a reason.

```python
2 changes: 1 addition & 1 deletion docs/docs/metrics-introduction.mdx
@@ -94,7 +94,7 @@ answer_relevancy_metric = AnswerRelevancyMetric(model=custom_azure_openai_model)
```

:::note
-While the Azure OpenAI command configures `deepeval` to use Azure OpenAI globally for all LLM-Evals, a custom LLM is not. Remember to provide your custom LLM instance through the `model` parameter for metrics you wish to use it for.
+While the Azure OpenAI command configures `deepeval` to use Azure OpenAI globally for all LLM-Evals, a custom LLM has to be set each time you instantiate a metric. Remember to provide your custom LLM instance through the `model` parameter for metrics you wish to use it for.
:::

## Measuring a Metric
5 changes: 5 additions & 0 deletions docs/docs/metrics-ragas.mdx
@@ -55,3 +55,8 @@ print(metric.score)
# or evaluate test cases in bulk
evaluate([test_case], [metric])
```

There are two optional parameters when creating the `RagasMetric`:

- [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to 0.3.
- [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** any one of langchain's [chat models](https://python.langchain.com/docs/integrations/chat/) of type `BaseChatModel`. Defaulted to 'gpt-3.5-turbo'.
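
As with the metrics above, `model` also accepts a langchain chat model instance. A hedged sketch using `AzureChatOpenAI` (the constructor arguments below are placeholders, and the exact parameter names depend on your langchain version and Azure setup):

```python
from langchain.chat_models import AzureChatOpenAI  # import path and arguments vary by langchain version

from deepeval.metrics.ragas_metric import RagasMetric

# Placeholder Azure settings -- substitute your own deployment details.
azure_chat_model = AzureChatOpenAI(
    deployment_name="my-gpt-35-deployment",
    openai_api_version="2023-05-15",
    azure_endpoint="https://my-resource.openai.azure.com/",
    openai_api_key="...",
)

metric = RagasMetric(threshold=0.3, model=azure_chat_model)
```
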
2 changes: 1 addition & 1 deletion docs/docs/metrics-summarization.mdx
@@ -65,7 +65,7 @@ evaluate([test_case], [metric])
There are five optional parameters when instantiating a `SummarizationMetric` class:

- [Optional] `threshold`: the passing threshold, defaulted to 0.5.
-- [Optional] `model`: the model name. This is defaulted to 'gpt-4-1106-preview' and we currently only support models from (Azure) OpenAI.
+- [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** any one of langchain's [chat models](https://python.langchain.com/docs/integrations/chat/) of type `BaseChatModel`. Defaulted to 'gpt-4-1106-preview'.
- [Optional] `assessment_questions`: a list of **closed-ended questions that can be answered with either a 'yes' or a 'no'**. These are questions you ideally want your summary to be able to answer, and they are especially helpful if you already know what a good summary for your use case looks like. If `assessment_questions` is not provided, we will generate a set of `assessment_questions` for you at evaluation time. The `assessment_questions` are used to calculate the `inclusion_score`.
- [Optional] `n`: the number of questions to generate when calculating the `alignment_score` and `inclusion_score`, defaulted to 5.

