Merge from main. #28

Merged · 21 commits · Jan 15, 2024
.DS_Store (binary file modified; contents not shown)
README.md (16 changes: 8 additions & 8 deletions)
@@ -94,7 +94,7 @@ def test_case():

# Replace this with the actual output from your LLM application
actual_output = "We offer a 30-day full refund at no extra costs."
-hallucination_metric = HallucinationMetric(minimum_score=0.7)
+hallucination_metric = HallucinationMetric(threshold=0.7)
test_case = LLMTestCase(input=input, actual_output=actual_output, context=context)
assert_test(test_case, [hallucination_metric])
```
@@ -108,8 +108,8 @@ deepeval test run test_chatbot.py
**Your test should have passed ✅** Let's break down what happened.

- The variable `input` mimics user input, and `actual_output` is a placeholder for your chatbot's intended output based on this query.
-- The variable `context` contains the relevant information from your knowledge base, and `HallucinationMetric(minimum_score=0.7)` is an out-of-the-box metric provided by DeepEval. It helps you evaluate the factual accuracy of your chatbot's output based on the provided context.
-- The metric score ranges from 0 - 1. The `minimum_score=0.7` threshold ultimately determines whether your test has passed or not.
+- The variable `context` contains the relevant information from your knowledge base, and `HallucinationMetric(threshold=0.7)` is an out-of-the-box metric provided by DeepEval. It helps you evaluate the factual accuracy of your chatbot's output based on the provided context.
+- The metric score ranges from 0 - 1. The `threshold=0.7` threshold ultimately determines whether your test has passed or not.

[Read our documentation](https://docs.confident-ai.com/docs/getting-started) for more information on how to use additional metrics, create your own custom metrics, and tutorials on how to integrate with other tools like LangChain and LlamaIndex.

@@ -120,7 +120,7 @@ deepeval test run test_chatbot.py
Alternatively, you can evaluate without Pytest, which is more suited for a notebook environment.

```python
-from deepeval import evalate
+from deepeval import evaluate
from deepeval.metrics import HallucinationMetric
from deepeval.test_case import LLMTestCase

@@ -129,13 +129,13 @@ context = ["All customers are eligible for a 30 day full refund at no extra cost
# Replace this with the actual output from your LLM application
actual_output = "We offer a 30-day full refund at no extra costs."

-hallucination_metric = HallucinationMetric(minimum_score=0.7)
+hallucination_metric = HallucinationMetric(threshold=0.7)
test_case = LLMTestCase(
input=input,
actual_output=actual_output,
context=context
)
-evalate([test_case], [hallucination_metric])
+evaluate([test_case], [hallucination_metric])
```

## Evaluating a Dataset / Test Cases in Bulk
@@ -159,8 +159,8 @@ dataset = EvaluationDataset(test_cases=[first_test_case, second_test_case])
dataset,
)
def test_customer_chatbot(test_case: LLMTestCase):
-hallucination_metric = HallucinationMetric(minimum_score=0.3)
-answer_relevancy_metric = AnswerRelevancyMetric(minimum_score=0.5)
+hallucination_metric = HallucinationMetric(threshold=0.3)
+answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5)
assert_test(test_case, [hallucination_metric, answer_relevancy_metric])
```

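Taken together, the README edits track a breaking rename in the metric constructors: the `minimum_score` keyword is now `threshold`. A minimal before/after sketch for anyone upgrading (values taken from the README snippets above; this assumes no backwards-compatibility alias for the old keyword, and none of the diffs below add one):

```python
from deepeval.metrics import HallucinationMetric

# deepeval 0.20.44 and earlier:
#   metric = HallucinationMetric(minimum_score=0.7)

# deepeval 0.20.46 (this PR): the keyword is now `threshold`
metric = HallucinationMetric(threshold=0.7)

# Calls that still pass the removed keyword fail, since __init__
# only accepts `threshold`:
#   TypeError: __init__() got an unexpected keyword argument 'minimum_score'
```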
deepeval/_version.py (2 changes: 1 addition & 1 deletion)
@@ -1 +1 @@
__version__: str = "0.20.44"
__version__: str = "0.20.46"
deepeval/evaluate.py (6 changes: 3 additions & 3 deletions)
@@ -111,7 +111,7 @@ def assert_test(test_case: LLMTestCase, metrics: List[BaseMetric]):
]
failed_metrics_str = ", ".join(
[
f"{metric.__name__} (score: {metric.score}, minimum_score: {metric.minimum_score})"
f"{metric.__name__} (score: {metric.score}, threshold: {metric.threshold})"
for metric in failed_metrics
]
)
@@ -138,11 +138,11 @@ def print_test_result(test_result: TestResult):
for metric in test_result.metrics:
if not metric.is_successful():
print(
f" - ❌ {metric.__name__} (score: {metric.score}, minimum_score: {metric.minimum_score}, reason: {metric.reason})"
f" - ❌ {metric.__name__} (score: {metric.score}, threshold: {metric.threshold}, reason: {metric.reason})"
)
else:
print(
f" - ✅ {metric.__name__} (score: {metric.score}, minimum_score: {metric.minimum_score}, reason: {metric.reason})"
f" - ✅ {metric.__name__} (score: {metric.score}, threshold: {metric.threshold}, reason: {metric.reason})"
)
if metric.score_metadata:
for metric_name, score in metric.score_metadata.items():
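The substance of this change is that failure summaries and per-metric result lines now report `threshold` instead of `minimum_score`. A self-contained sketch of what the updated f-string renders (the metric name, score, and reason values are hypothetical stand-ins):

```python
class FakeMetric:
    """Stand-in exposing the attributes the f-strings above read."""
    __name__ = "Hallucination"             # hypothetical display name
    score = 0.6                            # hypothetical
    threshold = 0.7                        # hypothetical
    reason = "output contradicts context"  # hypothetical

metric = FakeMetric()
print(
    f" - ❌ {metric.__name__} (score: {metric.score}, "
    f"threshold: {metric.threshold}, reason: {metric.reason})"
)
# Prints: " - ❌ Hallucination (score: 0.6, threshold: 0.7, reason: output contradicts context)"
```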
deepeval/metrics/__init__.py (2 changes: 2 additions & 0 deletions)
@@ -11,10 +11,12 @@
from .contextual_precision import ContextualPrecisionMetric
from .ragas_metric import (
RagasMetric,
RAGASAnswerRelevancyMetric,
RAGASFaithfulnessMetric,
RAGASContextualRecallMetric,
RAGASContextualRelevancyMetric,
RAGASContextualPrecisionMetric,
RAGASAnswerRelevancyMetric,
RAGASConcisenessMetric as ConcisenessMetric,
RAGASCorrectnessMetric as CorrectnessMetric,
RAGASCoherenceMetric as CoherenceMetric,
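A side note on this export list: both the `RAGAS`-prefixed names and their shorter aliases resolve from `deepeval.metrics`, per the `as` clauses above. A quick import sketch:

```python
# Both spellings work, per the aliases in this __init__.py:
from deepeval.metrics import RAGASAnswerRelevancyMetric
from deepeval.metrics import ConcisenessMetric  # alias of RAGASConcisenessMetric
```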
deepeval/metrics/answer_relevancy.py (7 changes: 4 additions & 3 deletions)
@@ -18,11 +18,11 @@ class AnswerRelvancyVerdict(BaseModel):
class AnswerRelevancyMetric(BaseMetric):
def __init__(
self,
-minimum_score: float = 0.5,
+threshold: float = 0.5,
model: Optional[str] = None,
include_reason: bool = True,
):
-self.minimum_score = minimum_score
+self.threshold = threshold
self.model = model
self.include_reason = include_reason
self.n = 5
@@ -49,7 +49,7 @@ def measure(self, test_case: LLMTestCase) -> float:
self.reason = self._generate_reason(
test_case.input, test_case.actual_output, answer_relevancy_score
)
-self.success = answer_relevancy_score >= self.minimum_score
+self.success = answer_relevancy_score >= self.threshold
self.score = answer_relevancy_score
return self.score

@@ -115,6 +115,7 @@ def _generate_key_points(
return data["key_points"]

def is_successful(self) -> bool:
+self.success = self.score >= self.threshold
return self.success

@property
deepeval/metrics/base_metric.py (22 changes: 5 additions & 17 deletions)
@@ -5,34 +5,22 @@


class BaseMetric:
-# set an arbitrary minimum score that will get over-ridden later
score: float = 0
score_metadata: Dict = None
reason: Optional[str] = None

@property
-def minimum_score(self) -> float:
-return self._minimum_score
+def threshold(self) -> float:
+return self._threshold

-@minimum_score.setter
-def minimum_score(self, value: float):
-self._minimum_score = value
+@threshold.setter
+def threshold(self, value: float):
+self._threshold = value

-# Measure function signature is subject to be different - not sure
-# how applicable this is - might need a better abstraction
@abstractmethod
def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
raise NotImplementedError

-def _get_init_values(self):
-# We use this method for sending useful metadata
-init_values = {
-param: getattr(self, param)
-for param in vars(self)
-if isinstance(getattr(self, param), (str, int, float))
-}
-return init_values

@abstractmethod
def is_successful(self) -> bool:
raise NotImplementedError
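With `minimum_score` and the `_get_init_values` helper removed, custom metrics built on `BaseMetric` should store and compare against `self.threshold`. A minimal sketch of a custom metric under the new API (the `ExactMatchMetric` name and its scoring logic are illustrative, not part of this PR):

```python
from deepeval.metrics.base_metric import BaseMetric
from deepeval.test_case import LLMTestCase


class ExactMatchMetric(BaseMetric):
    """Illustrative metric: scores 1.0 when actual_output equals expected_output."""

    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold  # stored via the renamed property setter

    def measure(self, test_case: LLMTestCase) -> float:
        self.score = float(test_case.actual_output == test_case.expected_output)
        self.success = self.score >= self.threshold
        return self.score

    def is_successful(self) -> bool:
        return self.success

    @property
    def __name__(self):
        return "Exact Match"
```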
deepeval/metrics/contextual_precision.py (7 changes: 4 additions & 3 deletions)
@@ -19,11 +19,11 @@ class ContextualPrecisionVerdict(BaseModel):
class ContextualPrecisionMetric(BaseMetric):
def __init__(
self,
-minimum_score: float = 0.5,
+threshold: float = 0.5,
model: Optional[str] = None,
include_reason: bool = True,
):
-self.minimum_score = minimum_score
+self.threshold = threshold
self.include_reason = include_reason
self.model = model

@@ -52,7 +52,7 @@ def measure(self, test_case: LLMTestCase) -> float:
test_case.input, contextual_precision_score
)

-self.success = contextual_precision_score >= self.minimum_score
+self.success = contextual_precision_score >= self.threshold
self.score = contextual_precision_score
return self.score

@@ -136,6 +136,7 @@ def _generate_verdicts(
return verdicts

def is_successful(self) -> bool:
+self.success = self.score >= self.threshold
return self.success

@property
deepeval/metrics/contextual_recall.py (7 changes: 4 additions & 3 deletions)
@@ -18,11 +18,11 @@ class ContextualRecallVerdict(BaseModel):
class ContextualRecallMetric(BaseMetric):
def __init__(
self,
-minimum_score: float = 0.5,
+threshold: float = 0.5,
model: Optional[str] = None,
include_reason: bool = True,
):
-self.minimum_score = minimum_score
+self.threshold = threshold
self.model = model
self.include_reason = include_reason
self.n = 5
@@ -50,7 +50,7 @@ def measure(self, test_case: LLMTestCase) -> float:
test_case.expected_output, contextual_recall_score
)

-self.success = contextual_recall_score >= self.minimum_score
+self.success = contextual_recall_score >= self.threshold
self.score = contextual_recall_score
return self.score

@@ -102,6 +102,7 @@ def _generate_verdicts(
return verdicts

def is_successful(self) -> bool:
+self.success = self.score >= self.threshold
return self.success

@property
deepeval/metrics/contextual_relevancy.py (7 changes: 4 additions & 3 deletions)
@@ -19,11 +19,11 @@ class ContextualRelevancyVerdict(BaseModel):
class ContextualRelevancyMetric(BaseMetric):
def __init__(
self,
-minimum_score: float = 0.5,
+threshold: float = 0.5,
model: Optional[str] = "gpt-4",
include_reason: bool = True,
):
-self.minimum_score = minimum_score
+self.threshold = threshold
self.model = model
self.include_reason = include_reason

@@ -48,7 +48,7 @@ def measure(self, test_case: LLMTestCase) -> float:
test_case.input, contextual_recall_score
)

-self.success = contextual_recall_score >= self.minimum_score
+self.success = contextual_recall_score >= self.threshold
self.score = contextual_recall_score

return self.score
@@ -131,6 +131,7 @@ def _generate_verdicts_list(
return verdicts_list

def is_successful(self) -> bool:
+self.success = self.score >= self.threshold
return self.success

@property
deepeval/metrics/faithfulness.py (7 changes: 4 additions & 3 deletions)
@@ -20,11 +20,11 @@ class FaithfulnessVerdict(BaseModel):
class FaithfulnessMetric(BaseMetric):
def __init__(
self,
-minimum_score: float = 0.5,
+threshold: float = 0.5,
model: Optional[str] = None,
include_reason: bool = True,
):
-self.minimum_score = minimum_score
+self.threshold = threshold
# Don't set self.chat_model when using threading
self.model = model
self.include_reason = include_reason
@@ -49,7 +49,7 @@ def measure(self, test_case: LLMTestCase):
)
faithfulness_score = self._generate_score()
self.reason = self._generate_reason(faithfulness_score)
-self.success = faithfulness_score >= self.minimum_score
+self.success = faithfulness_score >= self.threshold
self.score = faithfulness_score
return self.score

@@ -172,6 +172,7 @@ def _generate_verdicts_list(
return verdicts_list

def is_successful(self) -> bool:
+self.success = self.score >= self.threshold
return self.success

@property
deepeval/metrics/hallucination_metric.py (7 changes: 4 additions & 3 deletions)
@@ -7,9 +7,9 @@
class HallucinationMetric(BaseMetric, metaclass=Singleton):
def __init__(
self,
-minimum_score: float = 0.5,
+threshold: float = 0.5,
):
-self.minimum_score = minimum_score
+self.threshold = threshold

def measure(self, test_case: LLMTestCase):
if test_case.actual_output is None or test_case.context is None:
@@ -25,11 +25,12 @@ def measure(self, test_case: LLMTestCase):
if score > max_score:
max_score = score

-self.success = max_score > self.minimum_score
+self.success = max_score >= self.threshold
self.score = max_score
return max_score

def is_successful(self) -> bool:
+self.success = self.score >= self.threshold
return self.success

@property
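Beyond the rename, this hunk changes pass/fail semantics: the comparison moves from a strict `>` to an inclusive `>=`, so a score landing exactly on the threshold now counts as a pass. A small illustration with hypothetical values:

```python
threshold = 0.5
max_score = 0.5  # hypothetical score exactly at the threshold

old_success = max_score > threshold   # False: strict comparison, pre-PR
new_success = max_score >= threshold  # True: inclusive comparison, this PR
```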
deepeval/metrics/judgemental_gpt.py (9 changes: 5 additions & 4 deletions)
@@ -24,7 +24,7 @@ def __init__(
criteria: str,
evaluation_params: List[LLMTestCaseParams],
language: Languages = Languages.ENGLISH,
-minimum_score: float = 0.5,
+threshold: float = 0.5,
):
if not isinstance(language, Languages):
raise TypeError("'language' must be an instance of Languages.")
self.name = name
self.evaluation_params = evaluation_params
self.language = language.value
-self.minimum_score = minimum_score
+self.threshold = threshold
self.success = None
self.reason = None

@@ -70,9 +70,10 @@ def measure(self, test_case: LLMTestCase):
)
self.reason = response.reason
self.score = response.score / 10
-self.success = self.score >= self.minimum_score
+self.success = self.score >= self.threshold

return self.score

-def is_successful(self):
+def is_successful(self) -> bool:
+self.success = self.score >= self.threshold
return self.success
deepeval/metrics/llm_eval_metric.py (9 changes: 5 additions & 4 deletions)
@@ -26,7 +26,7 @@ def __init__(
criteria: Optional[str] = None,
evaluation_steps: Optional[List[str]] = None,
model: Optional[str] = None,
-minimum_score: float = 0.5,
+threshold: float = 0.5,
):
self.name = name
self.evaluation_params = evaluation_params
self.criteria = criteria
self.model = model
self.evaluation_steps = evaluation_steps
-self.minimum_score = minimum_score
+self.threshold = threshold

def measure(self, test_case: LLMTestCase):
"""LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""
score, reason = self.evaluate(test_case)
self.reason = reason
self.score = float(score) / 10
-self.success = score >= self.minimum_score
+self.success = score >= self.threshold
return self.score

-def is_successful(self):
+def is_successful(self) -> bool:
+self.success = self.score >= self.threshold
return self.success

def generate_evaluation_steps(self):
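One detail worth tracing in this last hunk: `self.evaluate` returns `score` on a 0-10 scale and `self.score` stores the 0-1 normalization, yet the inline success check compares the raw `score` against the 0-1 `threshold`. The recomputation added to `is_successful` uses the normalized `self.score`, so the value reported there is the consistent one. A small trace with hypothetical numbers:

```python
threshold = 0.5
raw_score = 6                        # hypothetical 0-10 value from the evaluator

normalized = float(raw_score) / 10   # 0.6, what self.score stores
inline = raw_score >= threshold      # True, but compares a 0-10 value to a 0-1 threshold
reported = normalized >= threshold   # True, what is_successful() recomputes
```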