Merge pull request #435 from confident-ai/features/integrations
New integrations
Showing 18 changed files with 400 additions and 40 deletions.
Empty file.
This file was deleted.
@@ -0,0 +1 @@
from deepeval.integrations.harness import DeepEvalHarnessCallback
File renamed without changes.
@@ -0,0 +1 @@
from deepeval.integrations.hugging_face import DeepEvalHuggingFaceCallback
File renamed without changes.
File renamed without changes.
@@ -0,0 +1,9 @@
from deepeval.integrations.llama_index.callback import LlamaIndexCallbackHandler
from deepeval.integrations.llama_index.evaluators import (
    AnswerRelevancyEvaluator,
    FaithfulnessEvaluator,
    ContextualRelevancyEvaluator,
    SummarizationEvaluator,
    ToxicityEvaluator,
    BiasEvaluator,
)
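The package also re-exports LlamaIndexCallbackHandler, whose definition is not part of this diff. A minimal sketch of wiring it into llama_index's callback system, assuming the handler can be constructed without arguments:

from llama_index import ServiceContext
from llama_index.callbacks import CallbackManager
from deepeval.integrations.llama_index import LlamaIndexCallbackHandler

# Constructor signature assumed; the handler is only imported, not defined, in this diff.
handler = LlamaIndexCallbackHandler()
# CallbackManager and ServiceContext.from_defaults are llama_index's standard way
# of attaching custom callback handlers.
callback_manager = CallbackManager(handlers=[handler])
service_context = ServiceContext.from_defaults(callback_manager=callback_manager)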
File renamed without changes.
@@ -0,0 +1,295 @@
import asyncio
from typing import Optional, Sequence, Any
from llama_index.evaluation.base import BaseEvaluator, EvaluationResult

from deepeval.test_case import LLMTestCase
from deepeval.metrics import (
    AnswerRelevancyMetric,
    FaithfulnessMetric,
    SummarizationMetric,
    ContextualRelevancyMetric,
    BiasMetric,
    ToxicityMetric,
)
from deepeval.integrations.llama_index.utils import conform_contexts_type


class AnswerRelevancyEvaluator(BaseEvaluator):
    def __init__(
        self,
        threshold: float = 0.5,
        include_reason: bool = True,
        model: Optional[str] = None,
    ):
        self.threshold = threshold
        self.include_reason = include_reason
        self.model = model

    def _get_prompts(self):
        pass

    def _update_prompts(self):
        pass

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        sleep_time_in_seconds: int = 0,
        **kwargs: Any,
    ) -> EvaluationResult:
        del kwargs  # Unused

        await asyncio.sleep(sleep_time_in_seconds)

        if query is None or response is None or contexts is None:
            raise ValueError("Query, response, and contexts must be provided")

        test_case = LLMTestCase(
            input=query,
            actual_output=response,
            retrieval_context=conform_contexts_type(contexts),
        )
        metric = AnswerRelevancyMetric(
            threshold=self.threshold,
            include_reason=self.include_reason,
            model=self.model,
        )
        metric.measure(test_case)
        return EvaluationResult(
            query=query,
            response=response,
            passing=metric.is_successful(),
            score=metric.score,
            feedback=metric.reason,
        )


class FaithfulnessEvaluator(BaseEvaluator):
    def __init__(
        self,
        threshold: float = 0.5,
        include_reason: bool = True,
        model: Optional[str] = None,
    ):
        self.threshold = threshold
        self.include_reason = include_reason
        self.model = model

    def _get_prompts(self):
        pass

    def _update_prompts(self):
        pass

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        sleep_time_in_seconds: int = 0,
        **kwargs: Any,
    ) -> EvaluationResult:
        del kwargs  # Unused

        await asyncio.sleep(sleep_time_in_seconds)

        if query is None or response is None or contexts is None:
            raise ValueError("Query, response, and contexts must be provided")

        test_case = LLMTestCase(
            input=query,
            actual_output=response,
            retrieval_context=conform_contexts_type(contexts),
        )
        metric = FaithfulnessMetric(
            threshold=self.threshold,
            include_reason=self.include_reason,
            model=self.model,
        )
        metric.measure(test_case)
        return EvaluationResult(
            query=query,
            response=response,
            passing=metric.is_successful(),
            score=metric.score,
            feedback=metric.reason,
        )


class ContextualRelevancyEvaluator(BaseEvaluator):
    def __init__(
        self,
        threshold: float = 0.5,
        include_reason: bool = True,
        model: Optional[str] = None,
    ):
        self.threshold = threshold
        self.include_reason = include_reason
        self.model = model

    def _get_prompts(self):
        pass

    def _update_prompts(self):
        pass

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        sleep_time_in_seconds: int = 0,
        **kwargs: Any,
    ) -> EvaluationResult:
        del kwargs  # Unused

        await asyncio.sleep(sleep_time_in_seconds)

        if query is None or response is None or contexts is None:
            raise ValueError("Query, response, and contexts must be provided")

        test_case = LLMTestCase(
            input=query,
            actual_output=response,
            retrieval_context=conform_contexts_type(contexts),
        )
        metric = ContextualRelevancyMetric(
            threshold=self.threshold,
            include_reason=self.include_reason,
            model=self.model,
        )
        metric.measure(test_case)
        return EvaluationResult(
            query=query,
            response=response,
            passing=metric.is_successful(),
            score=metric.score,
            feedback=metric.reason,
        )


class SummarizationEvaluator(BaseEvaluator):
    def __init__(
        self,
        threshold: float = 0.5,
        model: Optional[str] = None,
    ):
        self.threshold = threshold
        self.model = model

    def _get_prompts(self):
        pass

    def _update_prompts(self):
        pass

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        sleep_time_in_seconds: int = 0,
        **kwargs: Any,
    ) -> EvaluationResult:
        del kwargs  # Unused
        del contexts  # Unused

        await asyncio.sleep(sleep_time_in_seconds)

        if query is None or response is None:
            raise ValueError("Query and response must be provided")

        test_case = LLMTestCase(input=query, actual_output=response)
        metric = SummarizationMetric(threshold=self.threshold, model=self.model)
        metric.measure(test_case)
        return EvaluationResult(
            query=query,
            response=response,
            passing=metric.is_successful(),
            score=metric.score,
            feedback=metric.reason,
        )


class BiasEvaluator(BaseEvaluator):
    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold

    def _get_prompts(self):
        pass

    def _update_prompts(self):
        pass

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        sleep_time_in_seconds: int = 0,
        **kwargs: Any,
    ) -> EvaluationResult:
        del kwargs  # Unused
        del contexts  # Unused

        await asyncio.sleep(sleep_time_in_seconds)

        if query is None or response is None:
            raise ValueError("Query and response must be provided")

        test_case = LLMTestCase(
            input=query,
            actual_output=response,
        )
        metric = BiasMetric(threshold=self.threshold)
        metric.measure(test_case)
        return EvaluationResult(
            query=query,
            response=response,
            passing=metric.is_successful(),
            score=metric.score,
            feedback=metric.reason,
        )


class ToxicityEvaluator(BaseEvaluator):
    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold

    def _get_prompts(self):
        pass

    def _update_prompts(self):
        pass

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        sleep_time_in_seconds: int = 0,
        **kwargs: Any,
    ) -> EvaluationResult:
        del kwargs  # Unused
        del contexts  # Unused

        await asyncio.sleep(sleep_time_in_seconds)

        if query is None or response is None:
            raise ValueError("Query and response must be provided")

        test_case = LLMTestCase(
            input=query,
            actual_output=response,
        )
        metric = ToxicityMetric(threshold=self.threshold)
        metric.measure(test_case)
        return EvaluationResult(
            query=query,
            response=response,
            passing=metric.is_successful(),
            score=metric.score,
            feedback=metric.reason,
        )
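Each evaluator above wraps a deepeval metric behind llama_index's BaseEvaluator interface and surfaces the metric's score, pass/fail status, and reason through EvaluationResult. A minimal sketch of driving one of them directly; the query, response, and context strings are placeholder data, and the underlying deepeval metric still needs a configured model backend to run:

import asyncio

from deepeval.integrations.llama_index import AnswerRelevancyEvaluator

evaluator = AnswerRelevancyEvaluator(threshold=0.5, include_reason=True)

# aevaluate() is async, so run it with asyncio.run(); the inputs are placeholders.
result = asyncio.run(
    evaluator.aevaluate(
        query="What is the capital of France?",
        response="Paris is the capital of France.",
        contexts=["Paris is the capital and most populous city of France."],
    )
)
print(result.passing, result.score, result.feedback)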
39 changes: 39 additions & 0 deletions
deepeval/integrations/llama_index/tests/test_evaluators.py
@@ -0,0 +1,39 @@
import pytest
from deepeval.integrations.llama_index import (
    AnswerRelevancyEvaluator,
    FaithfulnessEvaluator,
    ContextualRelevancyEvaluator,
    SummarizationEvaluator,
    BiasEvaluator,
    ToxicityEvaluator,
)


def test_answer_relevancy():
    evaluator = AnswerRelevancyEvaluator()
    assert evaluator is not None


def test_faithfulness():
    evaluator = FaithfulnessEvaluator()
    assert evaluator is not None


def test_contextual_relevancy():
    evaluator = ContextualRelevancyEvaluator()
    assert evaluator is not None


def test_summarization():
    evaluator = SummarizationEvaluator()
    assert evaluator is not None


def test_bias():
    evaluator = BiasEvaluator()
    assert evaluator is not None


def test_toxicity():
    evaluator = ToxicityEvaluator()
    assert evaluator is not None
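The tests above are construction-only smoke tests. A possible functional extension, not part of this diff, could exercise aevaluate end to end; it is marked as skipped here because the underlying deepeval metrics require a configured LLM backend:

import asyncio

import pytest

from deepeval.integrations.llama_index import BiasEvaluator


@pytest.mark.skip(reason="requires a configured LLM backend for deepeval metrics")
def test_bias_evaluate():
    evaluator = BiasEvaluator(threshold=0.5)
    result = asyncio.run(
        evaluator.aevaluate(query="placeholder question", response="placeholder answer")
    )
    assert result.passing is not None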