linting
Anindyadeep committed Nov 28, 2023
1 parent d35ac7a commit e999114
Showing 11 changed files with 107 additions and 50 deletions.
9 changes: 5 additions & 4 deletions deepeval/metrics/factual_consistency.py
@@ -4,13 +4,14 @@
from deepeval.utils import chunk_text
from deepeval.scorer import Scorer


class FactualConsistencyMetric(BaseMetric, metaclass=Singleton):
    def __init__(
        self,
        minimum_score: float = 0.6,
        model_name: str = "cross-encoder/nli-deberta-v3-large",
    ):
        self.model_name = model_name
        self.minimum_score = minimum_score

    def measure(self, test_case: LLMTestCase):
@@ -25,15 +26,15 @@ def measure(self, test_case: LLMTestCase):
                context_list.extend(chunk_text(context))
        else:
            raise ValueError("Context must be a string or a list of strings")

        score = Scorer.factual_consistency_score(
            contexts=context_list,
            prediction=test_case.actual_output,
            model=self.model_name,
        )
        self.score = score
        self.success = score > self.minimum_score
        return score

    def is_successful(self) -> bool:
        return self.success
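
For reference, a hedged usage sketch of the metric touched above. The LLMTestCase field names follow the surrounding diff; the exact constructor arguments are an assumption, not part of this commit.

    from deepeval.metrics.factual_consistency import FactualConsistencyMetric
    from deepeval.test_case import LLMTestCase

    # Hypothetical example, not part of this commit.
    metric = FactualConsistencyMetric(minimum_score=0.6)
    test_case = LLMTestCase(
        input="Where is the Eiffel Tower?",  # assumed field
        actual_output="The Eiffel Tower is in Paris.",
        context=["The Eiffel Tower is located in Paris, France."],
    )
    score = metric.measure(test_case)
    print(score, metric.is_successful())
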
7 changes: 5 additions & 2 deletions deepeval/metrics/non_toxic_metric.py
@@ -7,9 +7,10 @@
from deepeval.metrics.base_metric import BaseMetric
from deepeval.scorer import Scorer


class NonToxicMetric(BaseMetric):
    def __init__(
        self,
        evaluation_params: List[LLMTestCaseParams],
        model_name: str = "original",
        minimum_score: float = 0.5,
@@ -40,7 +41,9 @@ def measure(self, test_case: LLMTestCase):
        for param in self.evaluation_params:
            text_to_evaluate = getattr(test_case, param.value)
            _, results = Scorer.neural_toxic_score(
                prediction=text_to_evaluate, model=self.model_name
            )
            # sample output
            # {'toxicity': 0.98057544,
            #  'severe_toxicity': 0.106649496,
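
A hedged sketch of driving NonToxicMetric as reformatted above: measure iterates evaluation_params and scores each referenced test-case field with Detoxify. The enum member name is assumed, not confirmed by this diff.

    from deepeval.metrics.non_toxic_metric import NonToxicMetric
    from deepeval.test_case import LLMTestCase, LLMTestCaseParams

    # Hypothetical example, not part of this commit.
    metric = NonToxicMetric(
        evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT],  # assumed enum member
        model_name="original",
        minimum_score=0.5,
    )
    test_case = LLMTestCase(input="Hi there", actual_output="Hello! Nice to meet you.")
    metric.measure(test_case)
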
5 changes: 4 additions & 1 deletion deepeval/metrics/unbias_metric.py
@@ -9,6 +9,7 @@
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.scorer import Scorer


class UnBiasedMetric(BaseMetric):
    def __init__(
        self,
@@ -45,7 +46,9 @@ def measure(self, test_case: LLMTestCase, return_all_scores: bool = False):
        )  # to accumulate all individual results if return_all_scores is True

        for param in self.evaluation_params:
            result = Scorer.neural_bias_score(
                getattr(test_case, param.value), model=self.model_name
            )
            if return_all_scores:
                all_results.append(result)
2 changes: 1 addition & 1 deletion deepeval/models/__init__.py
@@ -6,4 +6,4 @@
from deepeval.models.summac_model import SummaCModels
from deepeval.models.factual_consistency_model import FactualConsistencyModel
from deepeval.models.detoxify_model import DetoxifyModel
from deepeval.models.unbias_model import UnBiasedModel
2 changes: 2 additions & 0 deletions deepeval/models/answer_relevancy_model.py
@@ -2,10 +2,12 @@
from typing import Optional
from deepeval.models.base import DeepEvalBaseModel


def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)


class AnswerRelevancyModel(DeepEvalBaseModel):
    def __init__(self, model_name: Optional[str] = None):
        model_name = (
14 changes: 9 additions & 5 deletions deepeval/models/detoxify_model.py
@@ -1,14 +1,19 @@
import torch
from deepeval.models.base import DeepEvalBaseModel
from detoxify import Detoxify


class DetoxifyModel(DeepEvalBaseModel):
    def __init__(self, model_name: str | None = None, *args, **kwargs):
        if model_name is not None:
            assert model_name in [
                "original",
                "unbiased",
                "multilingual",
            ], "Invalid model. Available variants: original, unbiased, multilingual"
        model_name = "original" if model_name is None else model_name
        super().__init__(model_name, *args, **kwargs)

    def load_model(self):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        return Detoxify(self.model_name, device=device)
@@ -19,4 +24,3 @@ def _call(self, text: str):
            toxicity_score_dict
        )
        return mean_toxicity_score, toxicity_score_dict
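
The _call tail in the second hunk returns both a mean toxicity score and the raw per-label dict. A sketch of that aggregation, reusing the sample label values quoted in non_toxic_metric.py; the plain mean over labels is an assumption about the elided part of _call, not code from this commit.

    # Illustrative only: keys and values mirror the sample output comment
    # in non_toxic_metric.py; remaining Detoxify labels are omitted.
    toxicity_score_dict = {
        "toxicity": 0.98057544,
        "severe_toxicity": 0.106649496,
    }
    mean_toxicity_score = sum(toxicity_score_dict.values()) / len(toxicity_score_dict)
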

14 changes: 9 additions & 5 deletions deepeval/models/factual_consistency_model.py
@@ -1,17 +1,22 @@
import os
from deepeval.models.base import DeepEvalBaseModel
from sentence_transformers import CrossEncoder
from deepeval.utils import softmax


class FactualConsistencyModel(DeepEvalBaseModel):
    def __init__(self, model_name: str | None = None, *args, **kwargs):
        model_name = (
            "cross-encoder/nli-deberta-v3-large"
            if model_name is None
            else model_name
        )
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
        super().__init__(model_name, *args, **kwargs)

    def load_model(self):
        return CrossEncoder(self.model_name)

    def _call(self, text_a: str, text_b: str):
        scores = self.model.predict([(text_a, text_b), (text_b, text_a)])
        # https://huggingface.co/cross-encoder/nli-deberta-base
@@ -20,4 +25,3 @@ def _call(self, text_a: str, text_b: str):
        score = softmax_scores[0][1]
        second_score = softmax_scores[1][1]
        return max(score, second_score)

12 changes: 7 additions & 5 deletions deepeval/models/hallucination_model.py
@@ -1,16 +1,18 @@
import os
from typing import Optional
from deepeval.singleton import Singleton
from sentence_transformers import CrossEncoder
from deepeval.progress_context import progress_context


class HallucinationModel(metaclass=Singleton):
    def __init__(self, model_name: Optional[str] = None):
        # We use a simple cross-encoder model
        model_name = (
            "vectara/hallucination_evaluation_model"
            if model_name is None
            else model_name
        )
        os.environ["TOKENIZERS_PARALLELISM"] = "false"

        # TODO: add this progress context in the correct place
54 changes: 43 additions & 11 deletions deepeval/models/summac_model.py
@@ -1,32 +1,64 @@
import torch
from typing import List, Union, get_origin
from deepeval.models.base import DeepEvalBaseModel
from deepeval.models._summac_model import _SummaCZS


class SummaCModels(DeepEvalBaseModel):
    def __init__(
        self,
        model_name: str | None = None,
        granularity: str | None = None,
        device: str | None = None,
        *args,
        **kwargs
    ):
        model_name = "vitc" if model_name is None else model_name
        self.granularity = "sentence" if granularity is None else granularity
        self.device = (
            device
            if device is not None
            else "cuda"
            if torch.cuda.is_available()
            else "cpu"
        )
        super().__init__(model_name, *args, **kwargs)

    def load_model(
        self,
        op1: str | None = "max",
        op2: str | None = "mean",
        use_ent: bool | None = True,
        use_con: bool | None = True,
        image_load_cache: bool | None = True,
        **kwargs
    ):
        return _SummaCZS(
            model_name=self.model_name,
            granularity=self.granularity,
            device=self.device,
            op1=op1,
            op2=op2,
            use_con=use_con,
            use_ent=use_ent,
            imager_load_cache=image_load_cache,
            **kwargs
        )

    def _call(
        self, predictions: Union[str, List[str]], targets: Union[str, List[str]]
    ) -> Union[float, dict]:
        list_type = List[str]

        if (
            get_origin(predictions) is list_type
            and get_origin(targets) is list_type
        ):
            return self.model.score(targets, predictions)
        elif isinstance(predictions, str) and isinstance(targets, str):
            return self.model.score_one(targets, predictions)
        else:
            raise TypeError(
                "Either both predictions and targets should be List or both should be string"
            )
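
One behavioral note on the reformatted _call: typing.get_origin inspects typing constructs rather than runtime values (get_origin(["a"]) is None, never List[str]), so the list branch appears unreachable and list inputs fall through to the TypeError; only the string/string path, which dispatches to score_one, works as written. A hedged sketch of that path, mirroring how scorer.py below invokes it:

    from deepeval.models.summac_model import SummaCModels

    # Hypothetical example, not part of this commit.
    model = SummaCModels()  # defaults: "vitc", "sentence" granularity, auto device
    result = model(
        "The report says revenue grew 10% in Q3.",  # target
        "Revenue grew 10% in the third quarter.",   # prediction
    )
    print(result["score"])  # score_one returns a dict with a "score" key, per scorer.py
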
7 changes: 4 additions & 3 deletions deepeval/models/unbias_model.py
@@ -1,17 +1,18 @@
from typing import Optional
from deepeval.models.base import DeepEvalBaseModel


class UnBiasedModel(DeepEvalBaseModel):
    def __init__(self, model_name: str | None = None, *args, **kwargs):
        model_name = "original" if model_name is None else model_name
        super().__init__(model_name, *args, **kwargs)

    def load_model(self):
        try:
            from Dbias.bias_classification import classifier
        except ImportError as e:
            print("Run `pip install deepeval[bias]`")
        return classifier

    def _call(self, text):
        return self.model(text)
31 changes: 18 additions & 13 deletions deepeval/scorer/scorer.py
@@ -171,10 +171,15 @@ def bert_score(
            "bert-recall": recall.detach().numpy().tolist(),
            "bert-f1": f1.detach().numpy().tolist(),
        }

    @classmethod
    def faithfulness_score(
        cls,
        target: str,
        prediction: str,
        model: Optional[str] = None,
        granularity: Optional[str] = None,
        device: Optional[str] = None,
    ) -> float:
        """Calculate the faithfulness score of a prediction compared to a target text using SummaCZS.
@@ -188,18 +193,16 @@ def faithfulness_score(
        Returns:
            float: The computed faithfulness score. Higher values indicate greater faithfulness to the target text.

        Right now we are using the score_one method under the hood, instead of scoring multiple predictions for faithfulness.
        """
        try:
            from deepeval.models import SummaCModels
        except Exception as e:
            print(f"SummaCZS model can not be loaded.\n{e}")

        scorer = SummaCModels(
            model_name=model, granularity=granularity, device=device
        )
        return scorer(target, prediction)["score"]
@@ -337,27 +340,29 @@ def answer_relevancy_score(

    @classmethod
    def factual_consistency_score(
        cls,
        contexts: Union[List[str], str],
        prediction: str,
        model: Optional[str] = None,
    ) -> float:
        try:
            from deepeval.models import FactualConsistencyModel
        except Exception as e:
            print(f"Unable to load FactualConsistencyModel\n{e}")

        scorer = FactualConsistencyModel(model)
        contexts = [contexts] if isinstance(contexts, str) else contexts
        max_score = 0
        for context in contexts:
            score = scorer.predict(context, prediction)
            max_score = max(max_score, score)
        return max_score

    @classmethod
    def neural_bias_score(cls, text: str, model: Optional[str] = None) -> float:
        try:
            from deepeval.models import UnBiasedModel
        except Exception as e:
            print(f"Unable to load UnBiasedModel.\n{e}")
        scorer = UnBiasedModel(model_name=model)
        return scorer(text)
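
Taken together, the reformatted classmethods are invoked as plain static-style calls; a hedged sketch with illustrative inputs:

    from deepeval.scorer import Scorer

    # Hypothetical example, not part of this commit.
    consistency = Scorer.factual_consistency_score(
        contexts=["The Eiffel Tower is located in Paris, France."],
        prediction="The Eiffel Tower is in Paris.",
    )  # max cross-encoder score over the supplied contexts

    bias = Scorer.neural_bias_score("Tall people make better leaders.")
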
