New integrations #435

Merged: 3 commits, Jan 25, 2024
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -65,4 +65,4 @@ jobs:
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
run: |
-  poetry run pytest tests/ --ignore=tests/test_g_eval.py
+  poetry run pytest tests/
Empty file removed: deepeval/callbacks/__init__.py
4 changes: 0 additions & 4 deletions deepeval/callbacks/huggingface/__init__.py

This file was deleted.

1 change: 1 addition & 0 deletions deepeval/integrations/harness/__init__.py
@@ -0,0 +1 @@
from deepeval.integrations.harness import DeepEvalHarnessCallback
1 change: 1 addition & 0 deletions deepeval/integrations/hugging_face/__init__.py
@@ -0,0 +1 @@
from deepeval.integrations.hugging_face import DeepEvalHuggingFaceCallback
@@ -17,7 +17,7 @@
from .rich_manager import RichManager


-class DeepEvalCallback(TrainerCallback):
+class DeepEvalHuggingFaceCallback(TrainerCallback):
"""
Custom callback for deep evaluation during model training.

@@ -11,7 +11,7 @@

from datasets import load_dataset

-from deepeval.callbacks.huggingface import DeepEvalCallback
+from deepeval.integrations.hugging_face import DeepEvalHuggingFaceCallback
from deepeval.metrics import HallucinationMetric, AnswerRelevancyMetric
from deepeval.dataset import EvaluationDataset, Golden

@@ -147,8 +147,8 @@ def create_deepeval_dataset(dataset, sample_size):
)
metrics = [hallucination_metric, answer_relevancy_metric]

-# initalize DeepEvalCallback
-callback = DeepEvalCallback(
+# initalize DeepEvalHuggingFaceCallback
+callback = DeepEvalHuggingFaceCallback(
metrics=metrics,
evaluation_dataset=eval_dataset,
tokenizer_args=tokenizer_args,
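For context, a minimal usage sketch of the renamed callback with a Hugging Face Trainer. This is not part of the diff: the trainer, eval_dataset, tokenizer_args, and metric objects are assumed to be set up as in the example file above, and Trainer.add_callback is the generic transformers hook for attaching any TrainerCallback.

# Sketch only: assumes `trainer`, `eval_dataset`, `tokenizer_args`, and the two
# metric objects are defined as in the example above; not part of this PR.
from deepeval.integrations.hugging_face import DeepEvalHuggingFaceCallback

callback = DeepEvalHuggingFaceCallback(
    metrics=[hallucination_metric, answer_relevancy_metric],
    evaluation_dataset=eval_dataset,
    tokenizer_args=tokenizer_args,
)

trainer.add_callback(callback)  # standard transformers TrainerCallback registration
trainer.train()                 # the callback runs deepeval metrics during training (per its docstring above)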
9 changes: 9 additions & 0 deletions deepeval/integrations/llama_index/__init__.py
@@ -0,0 +1,9 @@
from deepeval.integrations.llama_index.callback import LlamaIndexCallbackHandler
from deepeval.integrations.llama_index.evaluators import (
AnswerRelevancyEvaluator,
FaithfulnessEvaluator,
ContextualRelevancyEvaluator,
SummarizationEvaluator,
ToxicityEvaluator,
BiasEvaluator,
)
295 changes: 295 additions & 0 deletions deepeval/integrations/llama_index/evaluators.py
@@ -0,0 +1,295 @@
import asyncio
from typing import Optional, Sequence, Any
from llama_index.evaluation.base import BaseEvaluator, EvaluationResult

from deepeval.test_case import LLMTestCase
from deepeval.metrics import (
AnswerRelevancyMetric,
FaithfulnessMetric,
SummarizationMetric,
ContextualRelevancyMetric,
BiasMetric,
ToxicityMetric,
)
from deepeval.integrations.llama_index.utils import conform_contexts_type


class AnswerRelevancyEvaluator(BaseEvaluator):
def __init__(
self,
threshold: float = 0.5,
include_reason: bool = True,
model: Optional[str] = None,
):
self.threshold = threshold
self.include_reason = include_reason
self.model = model

def _get_prompts(self):
pass

def _update_prompts(self):
pass

async def aevaluate(
self,
query: Optional[str] = None,
response: Optional[str] = None,
contexts: Optional[Sequence[str]] = None,
sleep_time_in_seconds: int = 0,
**kwargs: Any,
) -> EvaluationResult:
del kwargs # Unused

await asyncio.sleep(sleep_time_in_seconds)

if query is None or response is None or contexts is None:
raise ValueError("Query, response, and contexts must be provided")

test_case = LLMTestCase(
input=query,
actual_output=response,
retrieval_context=conform_contexts_type(contexts),
)
metric = AnswerRelevancyMetric(
threshold=self.threshold,
include_reason=self.include_reason,
model=self.model,
)
metric.measure(test_case)
return EvaluationResult(
query=query,
response=response,
passing=metric.is_successful(),
score=metric.score,
feedback=metric.reason,
)


class FaithfulnessEvaluator(BaseEvaluator):
def __init__(
self,
threshold: float = 0.5,
include_reason: bool = True,
model: Optional[str] = None,
):
self.threshold = threshold
self.include_reason = include_reason
self.model = model

def _get_prompts(self):
pass

def _update_prompts(self):
pass

async def aevaluate(
self,
query: Optional[str] = None,
response: Optional[str] = None,
contexts: Optional[Sequence[str]] = None,
sleep_time_in_seconds: int = 0,
**kwargs: Any,
) -> EvaluationResult:
del kwargs # Unused

await asyncio.sleep(sleep_time_in_seconds)

if query is None or response is None or contexts is None:
raise ValueError("Query, response, and contexts must be provided")

test_case = LLMTestCase(
input=query,
actual_output=response,
retrieval_context=conform_contexts_type(contexts),
)
metric = FaithfulnessMetric(
threshold=self.threshold,
include_reason=self.include_reason,
model=self.model,
)
metric.measure(test_case)
return EvaluationResult(
query=query,
response=response,
passing=metric.is_successful(),
score=metric.score,
feedback=metric.reason,
)


class ContextualRelevancyEvaluator(BaseEvaluator):
def __init__(
self,
threshold: float = 0.5,
include_reason: bool = True,
model: Optional[str] = None,
):
self.threshold = threshold
self.include_reason = include_reason
self.model = model

def _get_prompts(self):
pass

def _update_prompts(self):
pass

async def aevaluate(
self,
query: Optional[str] = None,
response: Optional[str] = None,
contexts: Optional[Sequence[str]] = None,
sleep_time_in_seconds: int = 0,
**kwargs: Any,
) -> EvaluationResult:
del kwargs # Unused

await asyncio.sleep(sleep_time_in_seconds)

if query is None or response is None or contexts is None:
raise ValueError("Query, response, and contexts must be provided")

test_case = LLMTestCase(
input=query,
actual_output=response,
retrieval_context=conform_contexts_type(contexts),
)
metric = ContextualRelevancyMetric(
threshold=self.threshold,
include_reason=self.include_reason,
model=self.model,
)
metric.measure(test_case)
return EvaluationResult(
query=query,
response=response,
passing=metric.is_successful(),
score=metric.score,
feedback=metric.reason,
)


class SummarizationEvaluator(BaseEvaluator):
def __init__(
self,
threshold: float = 0.5,
model: Optional[str] = None,
):
self.threshold = threshold
self.model = model

def _get_prompts(self):
pass

def _update_prompts(self):
pass

async def aevaluate(
self,
query: Optional[str] = None,
response: Optional[str] = None,
contexts: Optional[Sequence[str]] = None,
sleep_time_in_seconds: int = 0,
**kwargs: Any,
) -> EvaluationResult:
del kwargs # Unused
del contexts # Unused

await asyncio.sleep(sleep_time_in_seconds)

if query is None or response is None:
raise ValueError("Query and response must be provided")

test_case = LLMTestCase(input=query, actual_output=response)
metric = SummarizationMetric(threshold=self.threshold, model=self.model)
metric.measure(test_case)
return EvaluationResult(
query=query,
response=response,
passing=metric.is_successful(),
score=metric.score,
feedback=metric.reason,
)


class BiasEvaluator(BaseEvaluator):
def __init__(self, threshold: float = 0.5):
self.threshold = threshold

def _get_prompts(self):
pass

def _update_prompts(self):
pass

async def aevaluate(
self,
query: Optional[str] = None,
response: Optional[str] = None,
contexts: Optional[Sequence[str]] = None,
sleep_time_in_seconds: int = 0,
**kwargs: Any,
) -> EvaluationResult:
del kwargs # Unused
del contexts # Unused

await asyncio.sleep(sleep_time_in_seconds)

if query is None or response is None:
raise ValueError("Query and response must be provided")

test_case = LLMTestCase(
input=query,
actual_output=response,
)
metric = BiasMetric(threshold=self.threshold)
metric.measure(test_case)
return EvaluationResult(
query=query,
response=response,
passing=metric.is_successful(),
score=metric.score,
feedback=metric.reason,
)


class ToxicityEvaluator(BaseEvaluator):
def __init__(self, threshold: float = 0.5):
self.threshold = threshold

def _get_prompts(self):
pass

def _update_prompts(self):
pass

async def aevaluate(
self,
query: Optional[str] = None,
response: Optional[str] = None,
contexts: Optional[Sequence[str]] = None,
sleep_time_in_seconds: int = 0,
**kwargs: Any,
) -> EvaluationResult:
del kwargs # Unused
del contexts # Unused

await asyncio.sleep(sleep_time_in_seconds)

if query is None or response is None:
raise ValueError("Query and response must be provided")

test_case = LLMTestCase(
input=query,
actual_output=response,
)
metric = ToxicityMetric(threshold=self.threshold)
metric.measure(test_case)
return EvaluationResult(
query=query,
response=response,
passing=metric.is_successful(),
score=metric.score,
feedback=metric.reason,
)
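For reference, a minimal sketch of driving one of the new evaluators directly. The query, response, and contexts strings below are placeholder data, not part of this PR; the underlying AnswerRelevancyMetric calls the configured LLM, so an OPENAI_API_KEY (or a custom model) is needed at runtime.

# Sketch only: placeholder inputs, using the aevaluate coroutine defined above.
import asyncio

from deepeval.integrations.llama_index import AnswerRelevancyEvaluator

evaluator = AnswerRelevancyEvaluator(threshold=0.5, include_reason=True)

result = asyncio.run(
    evaluator.aevaluate(
        query="What does DeepEval do?",
        response="DeepEval provides metrics for evaluating LLM outputs.",
        contexts=["DeepEval is an open-source framework for LLM evaluation."],
    )
)

print(result.passing, result.score, result.feedback)  # fields populated in the EvaluationResult above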
39 changes: 39 additions & 0 deletions deepeval/integrations/llama_index/tests/test_evaluators.py
@@ -0,0 +1,39 @@
import pytest
from deepeval.integrations.llama_index import (
AnswerRelevancyEvaluator,
FaithfulnessEvaluator,
ContextualRelevancyEvaluator,
SummarizationEvaluator,
BiasEvaluator,
ToxicityEvaluator,
)


def test_answer_relevancy():
evaluator = AnswerRelevancyEvaluator()
assert evaluator is not None


def test_faithfulness():
evaluator = FaithfulnessEvaluator()
assert evaluator is not None


def test_contextual_relevancy():
evaluator = ContextualRelevancyEvaluator()
assert evaluator is not None


def test_summarization():
evaluator = SummarizationEvaluator()
assert evaluator is not None


def test_bias():
evaluator = BiasEvaluator()
assert evaluator is not None


def test_toxicity():
evaluator = ToxicityEvaluator()
assert evaluator is not None