Skip to content

Commit

Permalink
Merge pull request #417 from confident-ai/hotfix/separate-ragas
Browse files Browse the repository at this point in the history
Separated Ragas
  • Loading branch information
penguine-ip authored Jan 19, 2024
2 parents d6d6b81 + 02904d5 commit bbf8726
Show file tree
Hide file tree
Showing 13 changed files with 163 additions and 146 deletions.
4 changes: 2 additions & 2 deletions deepeval/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,8 @@ def print_test_result(test_result: TestResult):
print(
f" - ✅ {metric.__name__} (score: {metric.score}, threshold: {metric.threshold}, evaluation model: {metric.evaluation_model}, reason: {metric.reason})"
)
if metric.score_metadata:
for metric_name, score in metric.score_metadata.items():
if metric.score_breakdown:
for metric_name, score in metric.score_breakdown.items():
print(f" - {metric_name} (score: {score})")

print("\nFor test case:\n")
Expand Down
27 changes: 14 additions & 13 deletions deepeval/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,20 @@
from .contextual_precision import ContextualPrecisionMetric
from .latency import LatencyMetric
from .cost import CostMetric
from .ragas_metric import (
RagasMetric,
RAGASAnswerRelevancyMetric,
RAGASFaithfulnessMetric,
RAGASContextualRecallMetric,
RAGASContextualRelevancyMetric,
RAGASContextualPrecisionMetric,
RAGASAnswerRelevancyMetric,
RAGASConcisenessMetric as ConcisenessMetric,
RAGASCorrectnessMetric as CorrectnessMetric,
RAGASCoherenceMetric as CoherenceMetric,
RAGASMaliciousnessMetric as MaliciousnessMetric,
)

# from .ragas_metric import (
# RagasMetric,
# RAGASAnswerRelevancyMetric,
# RAGASFaithfulnessMetric,
# RAGASContextualRecallMetric,
# RAGASContextualRelevancyMetric,
# RAGASContextualPrecisionMetric,
# RAGASAnswerRelevancyMetric,
# RAGASConcisenessMetric as ConcisenessMetric,
# RAGASCorrectnessMetric as CorrectnessMetric,
# RAGASCoherenceMetric as CoherenceMetric,
# RAGASMaliciousnessMetric as MaliciousnessMetric,
# )
from .unbias_metric import UnBiasedMetric
from .non_toxic_metric import NonToxicMetric
from .hallucination_metric import HallucinationMetric
2 changes: 1 addition & 1 deletion deepeval/metrics/base_metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

class BaseMetric:
score: float = 0
score_metadata: Dict = None
score_breakdown: Dict = None
reason: Optional[str] = None
evaluation_model: Optional[str] = None

Expand Down
96 changes: 47 additions & 49 deletions deepeval/metrics/ragas_metric.py → deepeval/metrics/ragas.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
):
self.threshold = threshold
self.model = model
self.model = GPTModel(model=model)
self.evaluation_model = self.model.get_model_name()

def measure(self, test_case: LLMTestCase):
# sends to server
Expand All @@ -41,7 +42,7 @@ def measure(self, test_case: LLMTestCase):
raise ModuleNotFoundError("Please install dataset")

# Set LLM model
chat_model = GPTModel(self.model).load_model()
chat_model = self.model.load_model()
context_precision.llm = LangchainLLM(llm=chat_model)

# Create a dataset from the test case
Expand Down Expand Up @@ -79,7 +80,8 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
):
self.threshold = threshold
self.model = model
self.model = GPTModel(model=model)
self.evaluation_model = self.model.get_model_name()

def measure(self, test_case: LLMTestCase):
# sends to server
Expand All @@ -98,7 +100,7 @@ def measure(self, test_case: LLMTestCase):
raise ModuleNotFoundError("Please install dataset")

# Set LLM model
chat_model = GPTModel(self.model).load_model()
chat_model = self.model.load_model()
context_relevancy.llm = LangchainLLM(llm=chat_model)

# Create a dataset from the test case
Expand Down Expand Up @@ -135,7 +137,8 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
):
self.threshold = threshold
self.model = model
self.model = GPTModel(model=model)
self.evaluation_model = self.model.get_model_name()

def measure(self, test_case: LLMTestCase):
# sends to server
Expand All @@ -154,7 +157,7 @@ def measure(self, test_case: LLMTestCase):
raise ModuleNotFoundError("Please install dataset")

# Set LLM model
chat_model = GPTModel(self.model).load_model()
chat_model = self.model.load_model()
answer_relevancy.llm = LangchainLLM(llm=chat_model)

data = {
Expand Down Expand Up @@ -185,7 +188,8 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
):
self.threshold = threshold
self.model = model
self.model = GPTModel(model=model)
self.evaluation_model = self.model.get_model_name()

def measure(self, test_case: LLMTestCase):
# sends to server
Expand All @@ -204,7 +208,7 @@ def measure(self, test_case: LLMTestCase):
raise ModuleNotFoundError("Please install dataset")

# Set LLM model
chat_model = GPTModel(self.model).load_model()
chat_model = self.model.load_model()
faithfulness.llm = LangchainLLM(llm=chat_model)

data = {
Expand Down Expand Up @@ -237,7 +241,8 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
):
self.threshold = threshold
self.model = model
self.model = GPTModel(model=model)
self.evaluation_model = self.model.get_model_name()

def measure(self, test_case: LLMTestCase):
# sends to server
Expand All @@ -256,7 +261,7 @@ def measure(self, test_case: LLMTestCase):
raise ModuleNotFoundError("Please install dataset")

# Set LLM model
chat_model = GPTModel(self.model).load_model()
chat_model = self.model.load_model()
context_recall.llm = LangchainLLM(llm=chat_model)

data = {
Expand All @@ -280,7 +285,7 @@ def __name__(self):
return format_ragas_metric_name("Contextual Recall")


class RAGASHarmfulnessMetric(BaseMetric):
class HarmfulnessMetric(BaseMetric):
"""This metric checks the harmfulness using Ragas"""

def __init__(
Expand All @@ -289,7 +294,8 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
):
self.threshold = threshold
self.model = model
self.model = GPTModel(model=model)
self.evaluation_model = self.model.get_model_name()

def measure(self, test_case: LLMTestCase):
# sends to server
Expand All @@ -308,7 +314,7 @@ def measure(self, test_case: LLMTestCase):
raise ModuleNotFoundError("Please install dataset")

# Set LLM model
chat_model = GPTModel(self.model).load_model()
chat_model = self.model.load_model()
harmfulness.llm = LangchainLLM(llm=chat_model)

data = {
Expand All @@ -333,7 +339,7 @@ def __name__(self):
return "Harmfulness"


class RAGASCoherenceMetric(BaseMetric):
class CoherenceMetric(BaseMetric):
"""This metric checks the coherence using Ragas"""

def __init__(
Expand All @@ -342,7 +348,8 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
):
self.threshold = threshold
self.model = model
self.model = GPTModel(model=model)
self.evaluation_model = self.model.get_model_name()

def measure(self, test_case: LLMTestCase):
try:
Expand All @@ -359,7 +366,7 @@ def measure(self, test_case: LLMTestCase):
raise ModuleNotFoundError("Please install dataset")

# Set LLM model
chat_model = GPTModel(self.model).load_model()
chat_model = self.model.load_model()
coherence.llm = LangchainLLM(llm=chat_model)

data = {
Expand All @@ -384,7 +391,7 @@ def __name__(self):
return "Coherence"


class RAGASMaliciousnessMetric(BaseMetric):
class MaliciousnessMetric(BaseMetric):
"""This metric checks the maliciousness using Ragas"""

def __init__(
Expand All @@ -393,7 +400,8 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
):
self.threshold = threshold
self.model = model
self.model = GPTModel(model=model)
self.evaluation_model = self.model.get_model_name()

def measure(self, test_case: LLMTestCase):
try:
Expand All @@ -411,7 +419,7 @@ def measure(self, test_case: LLMTestCase):
raise ModuleNotFoundError("Please install dataset")

# Set LLM model
chat_model = GPTModel(self.model).load_model()
chat_model = self.model.load_model()
maliciousness.llm = LangchainLLM(llm=chat_model)

data = {
Expand All @@ -436,7 +444,7 @@ def __name__(self):
return "Maliciousness"


class RAGASCorrectnessMetric(BaseMetric):
class CorrectnessMetric(BaseMetric):
"""This metric checks the correctness using Ragas"""

def __init__(
Expand All @@ -445,7 +453,8 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
):
self.threshold = threshold
self.model = model
self.model = GPTModel(model=model)
self.evaluation_model = self.model.get_model_name()

def measure(self, test_case: LLMTestCase):
try:
Expand All @@ -463,7 +472,7 @@ def measure(self, test_case: LLMTestCase):
raise ModuleNotFoundError("Please install dataset")

# Set LLM model
chat_model = GPTModel(self.model).load_model()
chat_model = self.model.load_model()
correctness.llm = LangchainLLM(llm=chat_model)

data = {
Expand All @@ -488,7 +497,7 @@ def __name__(self):
return "Correctness"


class RAGASConcisenessMetric(BaseMetric):
class ConcisenessMetric(BaseMetric):
"""This metric checks the conciseness using Ragas"""

def __init__(
Expand All @@ -497,7 +506,8 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
):
self.threshold = threshold
self.model = model
self.model = GPTModel(model=model)
self.evaluation_model = self.model.get_model_name()

def measure(self, test_case: LLMTestCase):
try:
Expand All @@ -514,7 +524,7 @@ def measure(self, test_case: LLMTestCase):
raise ModuleNotFoundError("Please install dataset")

# Set LLM model
chat_model = GPTModel(self.model).load_model()
chat_model = self.model.load_model()
conciseness.llm = LangchainLLM(llm=chat_model)

data = {
Expand Down Expand Up @@ -548,7 +558,9 @@ def __init__(
model: Optional[Union[str, BaseChatModel]] = "gpt-3.5-turbo",
):
self.threshold = threshold
self.model = model
self.model_name = model
self.model = GPTModel(model=model)
self.evaluation_model = self.model.get_model_name()

def measure(self, test_case: LLMTestCase):
# sends to server
Expand All @@ -567,37 +579,23 @@ def measure(self, test_case: LLMTestCase):

# Create a dataset from the test case
# Convert the LLMTestCase to a format compatible with Dataset
score_metadata = {}
score_breakdown = {}
metrics = [
RAGASContextualPrecisionMetric(model=self.model),
RAGASContextualRecallMetric(model=self.model),
RAGASFaithfulnessMetric(model=self.model),
RAGASAnswerRelevancyMetric(model=self.model),
RAGASContextualPrecisionMetric(model=self.model_name),
RAGASContextualRecallMetric(model=self.model_name),
RAGASFaithfulnessMetric(model=self.model_name),
RAGASAnswerRelevancyMetric(model=self.model_name),
]

warnings_list = []

for metric in metrics:
score = metric.measure(test_case)
score_metadata[metric.__name__] = score
if score == 0:
warnings_list.append(
f"The RAGAS score will be 0 since {metric.__name__} has a score of 0"
)

for warning in warnings_list:
print(warning)

if any(score == 0 for score in score_metadata.values()):
ragas_score = 0
else:
ragas_score = len(score_metadata) / sum(
1.0 / score for score in score_metadata.values()
)
score_breakdown[metric.__name__] = score

ragas_score = sum(score_breakdown.values()) / len(score_breakdown)

self.success = ragas_score >= self.threshold
self.score = ragas_score
self.score_metadata = score_metadata
self.score_breakdown = score_breakdown
return self.score

def is_successful(self):
Expand Down
2 changes: 1 addition & 1 deletion deepeval/metrics/summarization.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def measure(self, test_case: LLMTestCase):
summarization_score = min(alignment_score, inclusion_score)

self.success = summarization_score >= self.threshold
self.score_metadata = {
self.score_breakdown = {
"Alignment": alignment_score,
"Inclusion": inclusion_score,
}
Expand Down
2 changes: 1 addition & 1 deletion docs/docs/metrics-answer-relevancy.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -55,5 +55,5 @@ There are three optional parameters when creating an `AnswerRelevancyMetric`:
You can also choose to fall back to Ragas' answer relevancy metric (which has a similar implementation). This, however, is not capable of generating a reason.

```python
from deepeval.metrics import RAGASAnswerRelevancyMetric
from deepeval.metrics.ragas import RAGASAnswerRelevancyMetric
```
2 changes: 1 addition & 1 deletion docs/docs/metrics-contextual-precision.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -60,5 +60,5 @@ There are three optional parameters when creating a `ContextualPrecisionMetric`:
You can also choose to fall back to Ragas' contextual precision metric (which has a similar implementation). This, however, is not capable of generating a reason.

```python
from deepeval.metrics import RAGASContextualPrecisionMetric
from deepeval.metrics.ragas import RAGASContextualPrecisionMetric
```
2 changes: 1 addition & 1 deletion docs/docs/metrics-contextual-recall.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -60,5 +60,5 @@ There are three optional parameters when creating a `ContextualRecallMetric`:
You can also choose to fall back to Ragas' contextual recall metric (which has a similar implementation). This, however, is not capable of generating a reason.

```python
from deepeval.metrics import RAGASContextualRecallMetric
from deepeval.metrics.ragas import RAGASContextualRecallMetric
```
2 changes: 1 addition & 1 deletion docs/docs/metrics-contextual-relevancy.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -59,5 +59,5 @@ There are three optional parameters when creating a `ContextualRelevancyMetricMe
You can also choose to fall back to Ragas' contextual relevancy metric (which has a similar implementation). This, however, is not capable of generating a reason.

```python
from deepeval.metrics import RAGASContextualRelevancyMetric
from deepeval.metrics.ragas import RAGASContextualRelevancyMetric
```
2 changes: 1 addition & 1 deletion docs/docs/metrics-faithfulness.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -59,5 +59,5 @@ There are three optional parameters when creating a `FaithfulnessMetric`:
You can also choose to fall back to Ragas' faithfulness metric (which has a similar implementation). This, however, is not capable of generating a reason.

```python
from deepeval.metrics import RAGASFaithfulnessMetric
from deepeval.metrics.ragas import RAGASFaithfulnessMetric
```
Loading

0 comments on commit bbf8726

Please sign in to comment.