Merge pull request #435 from confident-ai/features/integrations
New integrations
penguine-ip authored Jan 25, 2024
2 parents 46a9ae4 + 14e33fd commit d7a4b93
Showing 18 changed files with 400 additions and 40 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -65,4 +65,4 @@ jobs:
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
-          poetry run pytest tests/ --ignore=tests/test_g_eval.py
+          poetry run pytest tests/
Empty file removed deepeval/callbacks/__init__.py
4 changes: 0 additions & 4 deletions deepeval/callbacks/huggingface/__init__.py

This file was deleted.

1 change: 1 addition & 0 deletions deepeval/integrations/harness/__init__.py
@@ -0,0 +1 @@
from deepeval.integrations.harness import DeepEvalHarnessCallback
File renamed without changes.
1 change: 1 addition & 0 deletions deepeval/integrations/hugging_face/__init__.py
@@ -0,0 +1 @@
from deepeval.integrations.hugging_face import DeepEvalHuggingFaceCallback
@@ -17,7 +17,7 @@
from .rich_manager import RichManager


-class DeepEvalCallback(TrainerCallback):
+class DeepEvalHuggingFaceCallback(TrainerCallback):
    """
    Custom callback for deep evaluation during model training.
@@ -11,7 +11,7 @@

from datasets import load_dataset

-from deepeval.callbacks.huggingface import DeepEvalCallback
+from deepeval.integrations.hugging_face import DeepEvalHuggingFaceCallback
from deepeval.metrics import HallucinationMetric, AnswerRelevancyMetric
from deepeval.dataset import EvaluationDataset, Golden

@@ -147,8 +147,8 @@ def create_deepeval_dataset(dataset, sample_size):
)
metrics = [hallucination_metric, answer_relevancy_metric]

-# initalize DeepEvalCallback
-callback = DeepEvalCallback(
+# initialize DeepEvalHuggingFaceCallback
+callback = DeepEvalHuggingFaceCallback(
    metrics=metrics,
    evaluation_dataset=eval_dataset,
    tokenizer_args=tokenizer_args,
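For downstream users, the rename is summarized in the hedged sketch below. The metric thresholds are placeholders, and the callback construction is left commented out because the hunk above is truncated and may omit required constructor arguments.

# Migration sketch for this commit: the Hugging Face callback moved from
# deepeval.callbacks to deepeval.integrations and was renamed.
#
# Before this commit:
#   from deepeval.callbacks.huggingface import DeepEvalCallback
#
# After this commit:
from deepeval.integrations.hugging_face import DeepEvalHuggingFaceCallback
from deepeval.metrics import AnswerRelevancyMetric, HallucinationMetric

metrics = [
    HallucinationMetric(threshold=0.3),    # placeholder threshold
    AnswerRelevancyMetric(threshold=0.5),  # placeholder threshold
]

# The call below mirrors the (truncated) example hunk above; the callback may
# take further arguments cut off by the diff view, so treat this as a shape
# sketch rather than the full signature.
# callback = DeepEvalHuggingFaceCallback(
#     metrics=metrics,
#     evaluation_dataset=eval_dataset,   # built elsewhere in the example script
#     tokenizer_args=tokenizer_args,     # keyword arguments for the tokenizer
# )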
File renamed without changes.
9 changes: 9 additions & 0 deletions deepeval/integrations/llama_index/__init__.py
@@ -0,0 +1,9 @@
from deepeval.integrations.llama_index.callback import LlamaIndexCallbackHandler
from deepeval.integrations.llama_index.evaluators import (
    AnswerRelevancyEvaluator,
    FaithfulnessEvaluator,
    ContextualRelevancyEvaluator,
    SummarizationEvaluator,
    ToxicityEvaluator,
    BiasEvaluator,
)
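The evaluators re-exported here wrap deepeval metrics behind llama_index's evaluator interface (defined in evaluators.py below). A minimal usage sketch follows, assuming llama_index's BaseEvaluator exposes its usual synchronous evaluate() wrapper around aevaluate; the query, response, and context strings are hypothetical, and the underlying metric still needs a configured evaluation model (e.g. OPENAI_API_KEY).

# Hedged sketch: calling a deepeval-backed evaluator through llama_index's
# synchronous BaseEvaluator.evaluate() wrapper (assumed to exist upstream).
from deepeval.integrations.llama_index import FaithfulnessEvaluator

evaluator = FaithfulnessEvaluator(threshold=0.5, include_reason=True)
result = evaluator.evaluate(
    query="What is deepeval?",                                   # hypothetical
    response="deepeval is an LLM evaluation framework.",         # hypothetical
    contexts=["deepeval is an open-source LLM evaluation framework."],
)
print(result.passing, result.score)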
295 changes: 295 additions & 0 deletions deepeval/integrations/llama_index/evaluators.py
@@ -0,0 +1,295 @@
import asyncio
from typing import Optional, Sequence, Any
from llama_index.evaluation.base import BaseEvaluator, EvaluationResult

from deepeval.test_case import LLMTestCase
from deepeval.metrics import (
    AnswerRelevancyMetric,
    FaithfulnessMetric,
    SummarizationMetric,
    ContextualRelevancyMetric,
    BiasMetric,
    ToxicityMetric,
)
from deepeval.integrations.llama_index.utils import conform_contexts_type


class AnswerRelevancyEvaluator(BaseEvaluator):
    def __init__(
        self,
        threshold: float = 0.5,
        include_reason: bool = True,
        model: Optional[str] = None,
    ):
        self.threshold = threshold
        self.include_reason = include_reason
        self.model = model

    def _get_prompts(self):
        pass

    def _update_prompts(self):
        pass

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        sleep_time_in_seconds: int = 0,
        **kwargs: Any,
    ) -> EvaluationResult:
        del kwargs  # Unused

        await asyncio.sleep(sleep_time_in_seconds)

        if query is None or response is None or contexts is None:
            raise ValueError("Query, response, and contexts must be provided")

        test_case = LLMTestCase(
            input=query,
            actual_output=response,
            retrieval_context=conform_contexts_type(contexts),
        )
        metric = AnswerRelevancyMetric(
            threshold=self.threshold,
            include_reason=self.include_reason,
            model=self.model,
        )
        metric.measure(test_case)
        return EvaluationResult(
            query=query,
            response=response,
            passing=metric.is_successful(),
            score=metric.score,
            feedback=metric.reason,
        )


class FaithfulnessEvaluator(BaseEvaluator):
    def __init__(
        self,
        threshold: float = 0.5,
        include_reason: bool = True,
        model: Optional[str] = None,
    ):
        self.threshold = threshold
        self.include_reason = include_reason
        self.model = model

    def _get_prompts(self):
        pass

    def _update_prompts(self):
        pass

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        sleep_time_in_seconds: int = 0,
        **kwargs: Any,
    ) -> EvaluationResult:
        del kwargs  # Unused

        await asyncio.sleep(sleep_time_in_seconds)

        if query is None or response is None or contexts is None:
            raise ValueError("Query, response, and contexts must be provided")

        test_case = LLMTestCase(
            input=query,
            actual_output=response,
            retrieval_context=conform_contexts_type(contexts),
        )
        metric = FaithfulnessMetric(
            threshold=self.threshold,
            include_reason=self.include_reason,
            model=self.model,
        )
        metric.measure(test_case)
        return EvaluationResult(
            query=query,
            response=response,
            passing=metric.is_successful(),
            score=metric.score,
            feedback=metric.reason,
        )


class ContextualRelevancyEvaluator(BaseEvaluator):
    def __init__(
        self,
        threshold: float = 0.5,
        include_reason: bool = True,
        model: Optional[str] = None,
    ):
        self.threshold = threshold
        self.include_reason = include_reason
        self.model = model

    def _get_prompts(self):
        pass

    def _update_prompts(self):
        pass

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        sleep_time_in_seconds: int = 0,
        **kwargs: Any,
    ) -> EvaluationResult:
        del kwargs  # Unused

        await asyncio.sleep(sleep_time_in_seconds)

        if query is None or response is None or contexts is None:
            raise ValueError("Query, response, and contexts must be provided")

        test_case = LLMTestCase(
            input=query,
            actual_output=response,
            retrieval_context=conform_contexts_type(contexts),
        )
        metric = ContextualRelevancyMetric(
            threshold=self.threshold,
            include_reason=self.include_reason,
            model=self.model,
        )
        metric.measure(test_case)
        return EvaluationResult(
            query=query,
            response=response,
            passing=metric.is_successful(),
            score=metric.score,
            feedback=metric.reason,
        )


class SummarizationEvaluator(BaseEvaluator):
    def __init__(
        self,
        threshold: float = 0.5,
        model: Optional[str] = None,
    ):
        self.threshold = threshold
        self.model = model

    def _get_prompts(self):
        pass

    def _update_prompts(self):
        pass

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        sleep_time_in_seconds: int = 0,
        **kwargs: Any,
    ) -> EvaluationResult:
        del kwargs  # Unused
        del contexts  # Unused

        await asyncio.sleep(sleep_time_in_seconds)

        if query is None or response is None:
            raise ValueError("Query and response must be provided")

        test_case = LLMTestCase(input=query, actual_output=response)
        metric = SummarizationMetric(threshold=self.threshold, model=self.model)
        metric.measure(test_case)
        return EvaluationResult(
            query=query,
            response=response,
            passing=metric.is_successful(),
            score=metric.score,
            feedback=metric.reason,
        )


class BiasEvaluator(BaseEvaluator):
    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold

    def _get_prompts(self):
        pass

    def _update_prompts(self):
        pass

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        sleep_time_in_seconds: int = 0,
        **kwargs: Any,
    ) -> EvaluationResult:
        del kwargs  # Unused
        del contexts  # Unused

        await asyncio.sleep(sleep_time_in_seconds)

        if query is None or response is None:
            raise ValueError("Query and response must be provided")

        test_case = LLMTestCase(
            input=query,
            actual_output=response,
        )
        metric = BiasMetric(threshold=self.threshold)
        metric.measure(test_case)
        return EvaluationResult(
            query=query,
            response=response,
            passing=metric.is_successful(),
            score=metric.score,
            feedback=metric.reason,
        )


class ToxicityEvaluator(BaseEvaluator):
    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold

    def _get_prompts(self):
        pass

    def _update_prompts(self):
        pass

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        contexts: Optional[Sequence[str]] = None,
        sleep_time_in_seconds: int = 0,
        **kwargs: Any,
    ) -> EvaluationResult:
        del kwargs  # Unused
        del contexts  # Unused

        await asyncio.sleep(sleep_time_in_seconds)

        if query is None or response is None:
            raise ValueError("Query and response must be provided")

        test_case = LLMTestCase(
            input=query,
            actual_output=response,
        )
        metric = ToxicityMetric(threshold=self.threshold)
        metric.measure(test_case)
        return EvaluationResult(
            query=query,
            response=response,
            passing=metric.is_successful(),
            score=metric.score,
            feedback=metric.reason,
        )
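Because aevaluate is a coroutine, it can also be driven directly with asyncio. Below is a minimal sketch with hypothetical query, response, and context strings; the underlying metric still requires a configured evaluation model (e.g. OPENAI_API_KEY).

# Hedged sketch: driving AnswerRelevancyEvaluator.aevaluate directly and
# reading the EvaluationResult fields populated in the code above.
import asyncio

from deepeval.integrations.llama_index import AnswerRelevancyEvaluator


async def main():
    evaluator = AnswerRelevancyEvaluator(threshold=0.5)
    result = await evaluator.aevaluate(
        query="What is DeepEval?",                             # hypothetical
        response="DeepEval is an LLM evaluation framework.",   # hypothetical
        contexts=["DeepEval provides metrics for evaluating LLM outputs."],
    )
    # EvaluationResult carries the metric verdict, score, and reason:
    print(result.passing, result.score, result.feedback)


if __name__ == "__main__":
    asyncio.run(main())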
39 changes: 39 additions & 0 deletions deepeval/integrations/llama_index/tests/test_evaluators.py
@@ -0,0 +1,39 @@
import pytest
from deepeval.integrations.llama_index import (
    AnswerRelevancyEvaluator,
    FaithfulnessEvaluator,
    ContextualRelevancyEvaluator,
    SummarizationEvaluator,
    BiasEvaluator,
    ToxicityEvaluator,
)


def test_answer_relevancy():
    evaluator = AnswerRelevancyEvaluator()
    assert evaluator is not None


def test_faithfulness():
    evaluator = FaithfulnessEvaluator()
    assert evaluator is not None


def test_contextual_relevancy():
    evaluator = ContextualRelevancyEvaluator()
    assert evaluator is not None


def test_summarization():
    evaluator = SummarizationEvaluator()
    assert evaluator is not None


def test_bias():
    evaluator = BiasEvaluator()
    assert evaluator is not None


def test_toxicity():
    evaluator = ToxicityEvaluator()
    assert evaluator is not None