Added chat models to ragas
penguine-ip committed Jan 16, 2024
1 parent 6e31148 commit 637081f
Showing 9 changed files with 51 additions and 15 deletions.
27 changes: 14 additions & 13 deletions deepeval/metrics/ragas_metric.py
@@ -1,8 +1,9 @@
 """An implementation of the Ragas metric
 """
-from typing import Optional
-
+from typing import Optional, Union
 from ragas.llms import LangchainLLM
+from langchain_core.language_models import BaseChatModel
+
 from deepeval.metrics import BaseMetric
 from deepeval.test_case import LLMTestCase
 from deepeval.models import GPTModel
@@ -18,7 +19,7 @@ class RAGASContextualPrecisionMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.3,
-        model: Optional[str] = "gpt-3.5-turbo",
+        model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
     ):
         self.threshold = threshold
         self.model = model
@@ -75,7 +76,7 @@ class RAGASContextualRelevancyMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.3,
-        model: Optional[str] = "gpt-3.5-turbo",
+        model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
     ):
         self.threshold = threshold
         self.model = model
@@ -131,7 +132,7 @@ class RAGASAnswerRelevancyMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.3,
-        model: Optional[str] = "gpt-3.5-turbo",
+        model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
     ):
         self.threshold = threshold
         self.model = model
@@ -181,7 +182,7 @@ class RAGASFaithfulnessMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.3,
-        model: Optional[str] = "gpt-3.5-turbo",
+        model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
     ):
         self.threshold = threshold
         self.model = model
@@ -233,7 +234,7 @@ class RAGASContextualRecallMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.3,
-        model: Optional[str] = "gpt-3.5-turbo",
+        model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
     ):
         self.threshold = threshold
         self.model = model
@@ -285,7 +286,7 @@ class RAGASHarmfulnessMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.3,
-        model: Optional[str] = "gpt-3.5-turbo",
+        model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
     ):
         self.threshold = threshold
         self.model = model
@@ -338,7 +339,7 @@ class RAGASCoherenceMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.3,
-        model: Optional[str] = "gpt-3.5-turbo",
+        model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
     ):
         self.threshold = threshold
         self.model = model
@@ -389,7 +390,7 @@ class RAGASMaliciousnessMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.3,
-        model: Optional[str] = "gpt-3.5-turbo",
+        model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
     ):
         self.threshold = threshold
         self.model = model
@@ -441,7 +442,7 @@ class RAGASCorrectnessMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.3,
-        model: Optional[str] = "gpt-3.5-turbo",
+        model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
     ):
         self.threshold = threshold
         self.model = model
@@ -493,7 +494,7 @@ class RAGASConcisenessMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.3,
-        model: Optional[str] = "gpt-3.5-turbo",
+        model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
     ):
         self.threshold = threshold
         self.model = model
@@ -544,7 +545,7 @@ class RagasMetric(BaseMetric):
     def __init__(
         self,
         threshold: float = 0.3,
-        model: Optional[str] = "gpt-3.5-turbo",
+        model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
     ):
         self.threshold = threshold
         self.model = model
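
With this change, the `model` argument of each RAGAS metric accepts either an OpenAI model-name string or a langchain `BaseChatModel` instance. A minimal sketch of both call patterns (the `ChatOpenAI` import path is an illustrative assumption and may vary across langchain versions):

```python
from langchain.chat_models import ChatOpenAI  # any langchain BaseChatModel; import path may differ by version

from deepeval.metrics.ragas_metric import RAGASFaithfulnessMetric

# Both forms are now accepted for `model`:
metric_from_string = RAGASFaithfulnessMetric(threshold=0.3, model="gpt-3.5-turbo")
metric_from_chat_model = RAGASFaithfulnessMetric(
    threshold=0.3, model=ChatOpenAI(model="gpt-3.5-turbo")
)
```
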
6 changes: 6 additions & 0 deletions docs/docs/metrics-answer-relevancy.mdx
@@ -46,6 +46,12 @@ print(metric.reason)
evaluate([test_case], [metric])
```

There are three optional parameters when creating the `AnswerRelevancyMetric`:

- [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to 0.5.
- [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** any one of langchain's [chat models](https://python.langchain.com/docs/integrations/chat/) of type `BaseChatModel`. Defaulted to 'gpt-4-1106-preview'.
- [Optional] `include_reason`: a boolean which, when set to `True`, will include a reason for its evaluation score. Defaulted to `True`.
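
For example, a minimal sketch of supplying a langchain chat model in place of a model-name string (the `ChatAnthropic` import and its arguments are illustrative assumptions, not taken from these docs):

```python
from langchain.chat_models import ChatAnthropic  # any of langchain's BaseChatModel classes can be used here

from deepeval.metrics import AnswerRelevancyMetric

# A chat model instance is passed via `model`; a plain model-name string such as
# "gpt-4-1106-preview" would work just as well.
metric = AnswerRelevancyMetric(
    threshold=0.5,
    model=ChatAnthropic(model="claude-2", temperature=0),
    include_reason=True,
)
```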

You can also choose to fall back to Ragas' answer relevancy metric (which has a similar implementation). This, however, is not capable of generating a reason.

```python
6 changes: 6 additions & 0 deletions docs/docs/metrics-contextual-precision.mdx
@@ -46,6 +46,12 @@ print(metric.reason)
evaluate([test_case], [metric])
```

There are three optional parameters when creating the `ContextualPrecisionMetric`:

- [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to 0.5.
- [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** any one of langchain's [chat models](https://python.langchain.com/docs/integrations/chat/) of type `BaseChatModel`. Defaulted to 'gpt-4-1106-preview'.
- [Optional] `include_reason`: a boolean which, when set to `True`, will include a reason for its evaluation score. Defaulted to `True`.

You can also choose to fall back to Ragas' contextual precision metric (which has a similar implementation). This, however, is not capable of generating a reason.

```python
6 changes: 6 additions & 0 deletions docs/docs/metrics-contextual-recall.mdx
@@ -51,6 +51,12 @@ print(metric.reason)
evaluate([test_case], [metric])
```

There are three optional parameters when creating the `ContextualRecallMetric`:

- [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to 0.5.
- [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** any one of langchain's [chat models](https://python.langchain.com/docs/integrations/chat/) of type `BaseChatModel`. Defaulted to 'gpt-4-1106-preview'.
- [Optional] `include_reason`: a boolean which, when set to `True`, will include a reason for its evaluation score. Defaulted to `True`.

You can also choose to fall back to Ragas' contextual recall metric (which has a similar implementation). This, however, is not capable of generating a reason.

```python
6 changes: 6 additions & 0 deletions docs/docs/metrics-contextual-relevancy.mdx
@@ -50,6 +50,12 @@ print(metric.reason)
evaluate([test_case], [metric])
```

There are three optional parameters when creating the `ContextualRelevancyMetric`:

- [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to 0.5.
- [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** any one of langchain's [chat models](https://python.langchain.com/docs/integrations/chat/) of type `BaseChatModel`. Defaulted to 'gpt-4-1106-preview'.
- [Optional] `include_reason`: a boolean which, when set to `True`, will include a reason for its evaluation score. Defaulted to `True`.

You can also choose to fall back to Ragas' contextual relevancy metric (which has a similar implementation). This, however, is not capable of generating a reason.

```python
6 changes: 6 additions & 0 deletions docs/docs/metrics-faithfulness.mdx
@@ -50,6 +50,12 @@ print(metric.reason)
evaluate([test_case], [metric])
```

There are three optional parameters when creating the `FaithfulnessMetric`:

- [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to 0.5.
- [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** any one of langchain's [chat models](https://python.langchain.com/docs/integrations/chat/) of type `BaseChatModel`. Defaulted to 'gpt-4-1106-preview'.
- [Optional] `include_reason`: a boolean which, when set to `True`, will include a reason for its evaluation score. Defaulted to `True`.

You can also choose to fall back to Ragas' faithfulness metric (which has a similar implementation). This, however, is not capable of generating a reason.

```python
2 changes: 1 addition & 1 deletion docs/docs/metrics-introduction.mdx
@@ -94,7 +94,7 @@ answer_relevancy_metric = AnswerRelevancyMetric(model=custom_azure_openai_model)
```

:::note
-While the Azure OpenAI command configures `deepeval` to use Azure OpenAI globally for all LLM-Evals, a custom LLM is not. Remember to provide your custom LLM instance through the `model` parameter for metrics you wish to use it for.
+While the Azure OpenAI command configures `deepeval` to use Azure OpenAI globally for all LLM-Evals, a custom LLM has to be set each time you instantiate a metric. Remember to provide your custom LLM instance through the `model` parameter for metrics you wish to use it for.
:::

## Measuring a Metric
5 changes: 5 additions & 0 deletions docs/docs/metrics-ragas.mdx
@@ -55,3 +55,8 @@ print(metric.score)
# or evaluate test cases in bulk
evaluate([test_case], [metric])
```

There are two optional parameters when creating the `RagasMetric`:

- [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to 0.3.
- [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** any one of langchain's [chat models](https://python.langchain.com/docs/integrations/chat/) of type `BaseChatModel`. Defaulted to 'gpt-3.5-turbo'.
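
As with the metrics above, `model` also accepts a langchain chat model instance. A hedged sketch using `AzureChatOpenAI` (the constructor arguments below are placeholders, and the exact parameter names depend on your langchain version and Azure setup):

```python
from langchain.chat_models import AzureChatOpenAI  # import path and arguments vary by langchain version

from deepeval.metrics.ragas_metric import RagasMetric

# Placeholder Azure settings -- substitute your own deployment details.
azure_chat_model = AzureChatOpenAI(
    deployment_name="my-gpt-35-deployment",
    openai_api_version="2023-05-15",
    azure_endpoint="https://my-resource.openai.azure.com/",
    openai_api_key="...",
)

metric = RagasMetric(threshold=0.3, model=azure_chat_model)
```
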
2 changes: 1 addition & 1 deletion docs/docs/metrics-summarization.mdx
@@ -65,7 +65,7 @@ evaluate([test_case], [metric])
There are five optional parameters when instantiating a `SummarizationMetric` class:

- [Optional] `threshold`: the passing threshold, defaulted to 0.5.
-- [Optional] `model`: the model name. This is defaulted to 'gpt-4-1106-preview' and we currently only support models from (Azure) OpenAI.
+- [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** any one of langchain's [chat models](https://python.langchain.com/docs/integrations/chat/) of type `BaseChatModel`. Defaulted to 'gpt-4-1106-preview'.
- [Optional] `assessment_questions`: a list of **closed-ended questions that can be answered with either a 'yes' or a 'no'**. These are questions you ideally want your summary to be able to answer, and they are especially helpful if you already know what a good summary for your use case looks like. If `assessment_questions` is not provided, we will generate a set of `assessment_questions` for you at evaluation time. The `assessment_questions` are used to calculate the `inclusion_score`.
- [Optional] `n`: the number of questions to generate when calculating the `alignment_score` and `inclusion_score`, defaulted to 5.

