
Commit

Merge branch 'anindya/harness' of https://github.com/Anindyadeep/deepeval into anindya/harness
Anindyadeep committed Jan 14, 2024
2 parents 6715c8b + 6e56531 commit 0d9becf
Showing 56 changed files with 587 additions and 490 deletions.
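The thread running through the rendered hunks below is the rename of the metric constructor argument `minimum_score` to `threshold`, along with a matching `self.success = self.score >= self.threshold` check added to each metric's `is_successful`. As a quick orientation, here is a minimal usage sketch under the renamed API; it assumes `deepeval` is installed and an evaluation model is configured, and the `input` string is a hypothetical placeholder since the real value sits outside the diff context:

```python
from deepeval import evaluate
from deepeval.metrics import HallucinationMetric
from deepeval.test_case import LLMTestCase

# Values mirror the README hunks below; the input is a hypothetical stand-in.
context = ["All customers are eligible for a 30 day full refund at no extra cost."]
actual_output = "We offer a 30-day full refund at no extra costs."

hallucination_metric = HallucinationMetric(threshold=0.7)  # formerly minimum_score=0.7
test_case = LLMTestCase(
    input="What is your refund policy?",  # hypothetical user query
    actual_output=actual_output,
    context=context,
)
evaluate([test_case], [hallucination_metric])
```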
Binary file modified .DS_Store
Binary file not shown.
16 changes: 8 additions & 8 deletions README.md
@@ -94,7 +94,7 @@ def test_case():

# Replace this with the actual output from your LLM application
actual_output = "We offer a 30-day full refund at no extra costs."
hallucination_metric = HallucinationMetric(minimum_score=0.7)
hallucination_metric = HallucinationMetric(threshold=0.7)
test_case = LLMTestCase(input=input, actual_output=actual_output, context=context)
assert_test(test_case, [hallucination_metric])
```
@@ -108,8 +108,8 @@ deepeval test run test_chatbot.py
**Your test should have passed ✅** Let's break down what happened.

- The variable `input` mimics user input, and `actual_output` is a placeholder for your chatbot's intended output based on this query.
- The variable `context` contains the relevant information from your knowledge base, and `HallucinationMetric(minimum_score=0.7)` is an out-of-the-box metric provided by DeepEval. It helps you evaluate the factual accuracy of your chatbot's output based on the provided context.
- The metric score ranges from 0 - 1. The `minimum_score=0.7` threshold ultimately determines whether your test has passed or not.
- The variable `context` contains the relevant information from your knowledge base, and `HallucinationMetric(threshold=0.7)` is an out-of-the-box metric provided by DeepEval. It helps you evaluate the factual accuracy of your chatbot's output based on the provided context.
- The metric score ranges from 0 to 1. The `threshold=0.7` value ultimately determines whether your test has passed (see the sketch below).
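A minimal sketch of the pass/fail rule described in the bullets above, in plain Python and independent of deepeval, assuming only that a score in [0, 1] is compared against the configured threshold:

```python
def passes(score: float, threshold: float = 0.7) -> bool:
    """Pass when the metric score meets or exceeds the threshold."""
    return score >= threshold

# A hallucination score of 0.85 clears threshold=0.7, so the test passes;
# a score of 0.55 does not.
print(passes(0.85))  # True
print(passes(0.55))  # False
```

This mirrors the `self.success = self.score >= self.threshold` checks this commit adds across the metric classes.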

[Read our documentation](https://docs.confident-ai.com/docs/getting-started) for more information on how to use additional metrics, create your own custom metrics, and tutorials on how to integrate with other tools like LangChain and LlamaIndex.

@@ -120,7 +120,7 @@ deepeval test run test_chatbot.py
Alternatively, you can evaluate without Pytest, which is more suited for a notebook environment.

```python
from deepeval import evalate
from deepeval import evaluate
from deepeval.metrics import HallucinationMetric
from deepeval.test_case import LLMTestCase

@@ -129,13 +129,13 @@ context = ["All customers are eligible for a 30 day full refund at no extra cost
# Replace this with the actual output from your LLM application
actual_output = "We offer a 30-day full refund at no extra costs."

hallucination_metric = HallucinationMetric(minimum_score=0.7)
hallucination_metric = HallucinationMetric(threshold=0.7)
test_case = LLMTestCase(
input=input,
actual_output=actual_output,
context=context
)
evalate([test_case], [hallucination_metric])
evaluate([test_case], [hallucination_metric])
```

## Evaluating a Dataset / Test Cases in Bulk
@@ -159,8 +159,8 @@ dataset = EvaluationDataset(test_cases=[first_test_case, second_test_case])
dataset,
)
def test_customer_chatbot(test_case: LLMTestCase):
hallucination_metric = HallucinationMetric(minimum_score=0.3)
answer_relevancy_metric = AnswerRelevancyMetric(minimum_score=0.5)
hallucination_metric = HallucinationMetric(threshold=0.3)
answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5)
assert_test(test_case, [hallucination_metric, answer_relevancy_metric])
```

2 changes: 1 addition & 1 deletion deepeval/_version.py
@@ -1 +1 @@
__version__: str = "0.20.44"
__version__: str = "0.20.46"
6 changes: 3 additions & 3 deletions deepeval/evaluate.py
@@ -111,7 +111,7 @@ def assert_test(test_case: LLMTestCase, metrics: List[BaseMetric]):
]
failed_metrics_str = ", ".join(
[
f"{metric.__name__} (score: {metric.score}, minimum_score: {metric.minimum_score})"
f"{metric.__name__} (score: {metric.score}, threshold: {metric.threshold})"
for metric in failed_metrics
]
)
@@ -138,11 +138,11 @@ def print_test_result(test_result: TestResult):
for metric in test_result.metrics:
if not metric.is_successful():
print(
f" - ❌ {metric.__name__} (score: {metric.score}, minimum_score: {metric.minimum_score}, reason: {metric.reason})"
f" - ❌ {metric.__name__} (score: {metric.score}, threshold: {metric.threshold}, reason: {metric.reason})"
)
else:
print(
f" - ✅ {metric.__name__} (score: {metric.score}, minimum_score: {metric.minimum_score}, reason: {metric.reason})"
f" - ✅ {metric.__name__} (score: {metric.score}, threshold: {metric.threshold}, reason: {metric.reason})"
)
if metric.score_metadata:
for metric_name, score in metric.score_metadata.items():
2 changes: 2 additions & 0 deletions deepeval/metrics/__init__.py
@@ -11,10 +11,12 @@
from .contextual_precision import ContextualPrecisionMetric
from .ragas_metric import (
RagasMetric,
RAGASAnswerRelevancyMetric,
RAGASFaithfulnessMetric,
RAGASContextualRecallMetric,
RAGASContextualRelevancyMetric,
RAGASContextualPrecisionMetric,
RAGASAnswerRelevancyMetric,
RAGASConcisenessMetric as ConcisenessMetric,
RAGASCorrectnessMetric as CorrectnessMetric,
RAGASCoherenceMetric as CoherenceMetric,
7 changes: 4 additions & 3 deletions deepeval/metrics/answer_relevancy.py
@@ -18,11 +18,11 @@ class AnswerRelvancyVerdict(BaseModel):
class AnswerRelevancyMetric(BaseMetric):
def __init__(
self,
minimum_score: float = 0.5,
threshold: float = 0.5,
model: Optional[str] = None,
include_reason: bool = True,
):
self.minimum_score = minimum_score
self.threshold = threshold
self.model = model
self.include_reason = include_reason
self.n = 5
@@ -49,7 +49,7 @@ def measure(self, test_case: LLMTestCase) -> float:
self.reason = self._generate_reason(
test_case.input, test_case.actual_output, answer_relevancy_score
)
self.success = answer_relevancy_score >= self.minimum_score
self.success = answer_relevancy_score >= self.threshold
self.score = answer_relevancy_score
return self.score

@@ -115,6 +115,7 @@ def _generate_key_points(
return data["key_points"]

def is_successful(self) -> bool:
self.success = self.score >= self.threshold
return self.success

@property
22 changes: 5 additions & 17 deletions deepeval/metrics/base_metric.py
@@ -5,34 +5,22 @@


class BaseMetric:
# set an arbitrary minimum score that will get over-ridden later
score: float = 0
score_metadata: Dict = None
reason: Optional[str] = None

@property
def minimum_score(self) -> float:
return self._minimum_score
def threshold(self) -> float:
return self._threshold

@minimum_score.setter
def minimum_score(self, value: float):
self._minimum_score = value
@threshold.setter
def threshold(self, value: float):
self._threshold = value

# Measure function signature is subject to be different - not sure
# how applicable this is - might need a better abstraction
@abstractmethod
def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
raise NotImplementedError

def _get_init_values(self):
# We use this method for sending useful metadata
init_values = {
param: getattr(self, param)
for param in vars(self)
if isinstance(getattr(self, param), (str, int, float))
}
return init_values

@abstractmethod
def is_successful(self) -> bool:
raise NotImplementedError
7 changes: 4 additions & 3 deletions deepeval/metrics/contextual_precision.py
@@ -19,11 +19,11 @@ class ContextualPrecisionVerdict(BaseModel):
class ContextualPrecisionMetric(BaseMetric):
def __init__(
self,
minimum_score: float = 0.5,
threshold: float = 0.5,
model: Optional[str] = None,
include_reason: bool = True,
):
self.minimum_score = minimum_score
self.threshold = threshold
self.include_reason = include_reason
self.model = model

@@ -52,7 +52,7 @@ def measure(self, test_case: LLMTestCase) -> float:
test_case.input, contextual_precision_score
)

self.success = contextual_precision_score >= self.minimum_score
self.success = contextual_precision_score >= self.threshold
self.score = contextual_precision_score
return self.score

@@ -136,6 +136,7 @@ def _generate_verdicts(
return verdicts

def is_successful(self) -> bool:
self.success = self.score >= self.threshold
return self.success

@property
7 changes: 4 additions & 3 deletions deepeval/metrics/contextual_recall.py
@@ -18,11 +18,11 @@ class ContextualRecallVerdict(BaseModel):
class ContextualRecallMetric(BaseMetric):
def __init__(
self,
minimum_score: float = 0.5,
threshold: float = 0.5,
model: Optional[str] = None,
include_reason: bool = True,
):
self.minimum_score = minimum_score
self.threshold = threshold
self.model = model
self.include_reason = include_reason
self.n = 5
@@ -50,7 +50,7 @@ def measure(self, test_case: LLMTestCase) -> float:
test_case.expected_output, contextual_recall_score
)

self.success = contextual_recall_score >= self.minimum_score
self.success = contextual_recall_score >= self.threshold
self.score = contextual_recall_score
return self.score

@@ -102,6 +102,7 @@ def _generate_verdicts(
return verdicts

def is_successful(self) -> bool:
self.success = self.score >= self.threshold
return self.success

@property
7 changes: 4 additions & 3 deletions deepeval/metrics/contextual_relevancy.py
@@ -19,11 +19,11 @@ class ContextualRelevancyVerdict(BaseModel):
class ContextualRelevancyMetric(BaseMetric):
def __init__(
self,
minimum_score: float = 0.5,
threshold: float = 0.5,
model: Optional[str] = "gpt-4",
include_reason: bool = True,
):
self.minimum_score = minimum_score
self.threshold = threshold
self.model = model
self.include_reason = include_reason

@@ -48,7 +48,7 @@ def measure(self, test_case: LLMTestCase) -> float:
test_case.input, contextual_recall_score
)

self.success = contextual_recall_score >= self.minimum_score
self.success = contextual_recall_score >= self.threshold
self.score = contextual_recall_score

return self.score
@@ -131,6 +131,7 @@ def _generate_verdicts_list(
return verdicts_list

def is_successful(self) -> bool:
self.success = self.score >= self.threshold
return self.success

@property
7 changes: 4 additions & 3 deletions deepeval/metrics/faithfulness.py
@@ -20,11 +20,11 @@ class FaithfulnessVerdict(BaseModel):
class FaithfulnessMetric(BaseMetric):
def __init__(
self,
minimum_score: float = 0.5,
threshold: float = 0.5,
model: Optional[str] = None,
include_reason: bool = True,
):
self.minimum_score = minimum_score
self.threshold = threshold
# Don't set self.chat_model when using threading
self.model = model
self.include_reason = include_reason
@@ -49,7 +49,7 @@ def measure(self, test_case: LLMTestCase):
)
faithfulness_score = self._generate_score()
self.reason = self._generate_reason(faithfulness_score)
self.success = faithfulness_score >= self.minimum_score
self.success = faithfulness_score >= self.threshold
self.score = faithfulness_score
return self.score

@@ -172,6 +172,7 @@ def _generate_verdicts_list(
return verdicts_list

def is_successful(self) -> bool:
self.success = self.score >= self.threshold
return self.success

@property
7 changes: 4 additions & 3 deletions deepeval/metrics/hallucination_metric.py
@@ -7,9 +7,9 @@
class HallucinationMetric(BaseMetric, metaclass=Singleton):
def __init__(
self,
minimum_score: float = 0.5,
threshold: float = 0.5,
):
self.minimum_score = minimum_score
self.threshold = threshold

def measure(self, test_case: LLMTestCase):
if test_case.actual_output is None or test_case.context is None:
@@ -25,11 +25,12 @@ def measure(self, test_case: LLMTestCase):
if score > max_score:
max_score = score

self.success = max_score > self.minimum_score
self.success = max_score >= self.threshold
self.score = max_score
return max_score

def is_successful(self) -> bool:
self.success = self.score >= self.threshold
return self.success

@property
9 changes: 5 additions & 4 deletions deepeval/metrics/judgemental_gpt.py
@@ -24,7 +24,7 @@ def __init__(
criteria: str,
evaluation_params: List[LLMTestCaseParams],
language: Languages = Languages.ENGLISH,
minimum_score: float = 0.5,
threshold: float = 0.5,
):
if not isinstance(language, Languages):
raise TypeError("'language' must be an instance of Languages.")
@@ -33,7 +33,7 @@ def __init__(
self.name = name
self.evaluation_params = evaluation_params
self.language = language.value
self.minimum_score = minimum_score
self.threshold = threshold
self.success = None
self.reason = None

@@ -70,9 +70,10 @@ def measure(self, test_case: LLMTestCase):
)
self.reason = response.reason
self.score = response.score / 10
self.success = self.score >= self.minimum_score
self.success = self.score >= self.threshold

return self.score

def is_successful(self):
def is_successful(self) -> bool:
self.success = self.score >= self.threshold
return self.success
9 changes: 5 additions & 4 deletions deepeval/metrics/llm_eval_metric.py
@@ -26,7 +26,7 @@ def __init__(
criteria: Optional[str] = None,
evaluation_steps: Optional[List[str]] = None,
model: Optional[str] = None,
minimum_score: float = 0.5,
threshold: float = 0.5,
):
self.name = name
self.evaluation_params = evaluation_params
@@ -50,7 +50,7 @@ def __init__(
self.criteria = criteria
self.model = model
self.evaluation_steps = evaluation_steps
self.minimum_score = minimum_score
self.threshold = threshold

def measure(self, test_case: LLMTestCase):
"""LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""
@@ -73,10 +73,11 @@ def measure(self, test_case: LLMTestCase):
score, reason = self.evaluate(test_case)
self.reason = reason
self.score = float(score) / 10
self.success = score >= self.minimum_score
self.success = score >= self.threshold
return self.score

def is_successful(self):
def is_successful(self) -> bool:
self.success = self.score >= self.threshold
return self.success

def generate_evaluation_steps(self):