From e9991145a5e2705f905af3c4ff32bb2e8a5ba4e0 Mon Sep 17 00:00:00 2001
From: Anindyadeep
Date: Wed, 29 Nov 2023 02:31:24 +0530
Subject: [PATCH] linting

---
 deepeval/metrics/factual_consistency.py      |  9 ++--
 deepeval/metrics/non_toxic_metric.py         |  7 ++-
 deepeval/metrics/unbias_metric.py            |  5 +-
 deepeval/models/__init__.py                  |  2 +-
 deepeval/models/answer_relevancy_model.py    |  2 +
 deepeval/models/detoxify_model.py            | 14 +++--
 deepeval/models/factual_consistency_model.py | 14 +++--
 deepeval/models/hallucination_model.py       | 12 +++--
 deepeval/models/summac_model.py              | 54 ++++++++++++++++----
 deepeval/models/unbias_model.py              |  7 +--
 deepeval/scorer/scorer.py                    | 31 ++++++-----
 11 files changed, 107 insertions(+), 50 deletions(-)

diff --git a/deepeval/metrics/factual_consistency.py b/deepeval/metrics/factual_consistency.py
index 75215dcb5..854c4826b 100644
--- a/deepeval/metrics/factual_consistency.py
+++ b/deepeval/metrics/factual_consistency.py
@@ -4,13 +4,14 @@
 from deepeval.utils import chunk_text
 from deepeval.scorer import Scorer
 
+
 class FactualConsistencyMetric(BaseMetric, metaclass=Singleton):
     def __init__(
         self,
         minimum_score: float = 0.6,
         model_name: str = "cross-encoder/nli-deberta-v3-large",
     ):
-        self.model_name = model_name 
+        self.model_name = model_name
         self.minimum_score = minimum_score
 
     def measure(self, test_case: LLMTestCase):
@@ -25,15 +26,15 @@ def measure(self, test_case: LLMTestCase):
             context_list.extend(chunk_text(context))
         else:
             raise ValueError("Context must be a string or a list of strings")
-        
+
         score = Scorer.factual_consistency_score(
             contexts=context_list,
             prediction=test_case.actual_output,
-            model=self.model_name
+            model=self.model_name,
         )
         self.score = score
         self.success = score > self.minimum_score
-        return score 
+        return score
 
     def is_successful(self) -> bool:
         return self.success
diff --git a/deepeval/metrics/non_toxic_metric.py b/deepeval/metrics/non_toxic_metric.py
index 94bc15d20..96318bef2 100644
--- a/deepeval/metrics/non_toxic_metric.py
+++ b/deepeval/metrics/non_toxic_metric.py
@@ -7,9 +7,10 @@
 from deepeval.metrics.base_metric import BaseMetric
 from deepeval.scorer import Scorer
 
+
 class NonToxicMetric(BaseMetric):
     def __init__(
-        self, 
+        self,
         evaluation_params: List[LLMTestCaseParams],
         model_name: str = "original",
         minimum_score: float = 0.5,
@@ -40,7 +41,9 @@ def measure(self, test_case: LLMTestCase):
 
         for param in self.evaluation_params:
             text_to_evaluate = getattr(test_case, param.value)
-            _, results = Scorer.neural_toxic_score(prediction=text_to_evaluate, model=self.model_name)
+            _, results = Scorer.neural_toxic_score(
+                prediction=text_to_evaluate, model=self.model_name
+            )
             # sample output
             # {'toxicity': 0.98057544,
             # 'severe_toxicity': 0.106649496,
diff --git a/deepeval/metrics/unbias_metric.py b/deepeval/metrics/unbias_metric.py
index 55cb411ce..945d07c67 100644
--- a/deepeval/metrics/unbias_metric.py
+++ b/deepeval/metrics/unbias_metric.py
@@ -9,6 +9,7 @@
 from deepeval.test_case import LLMTestCase, LLMTestCaseParams
 from deepeval.scorer import Scorer
 
+
 class UnBiasedMetric(BaseMetric):
     def __init__(
         self,
@@ -45,7 +46,9 @@ def measure(self, test_case: LLMTestCase, return_all_scores: bool = False):
         )  # to accumulate all individual results if return_all_scores is True
 
         for param in self.evaluation_params:
-            result = Scorer.neural_bias_score(getattr(test_case, param.value), model=self.model_name)
+            result = Scorer.neural_bias_score(
+                getattr(test_case, param.value), model=self.model_name
+            )
 
             if return_all_scores:
                 all_results.append(result)
diff --git a/deepeval/models/__init__.py b/deepeval/models/__init__.py
index ee559ac3e..c29c53c14 100644
--- a/deepeval/models/__init__.py
+++ b/deepeval/models/__init__.py
@@ -6,4 +6,4 @@
 from deepeval.models.summac_model import SummaCModels
 from deepeval.models.factual_consistency_model import FactualConsistencyModel
 from deepeval.models.detoxify_model import DetoxifyModel
-from deepeval.models.unbias_model import UnBiasedModel
\ No newline at end of file
+from deepeval.models.unbias_model import UnBiasedModel
diff --git a/deepeval/models/answer_relevancy_model.py b/deepeval/models/answer_relevancy_model.py
index fc9f388d6..88ee30391 100644
--- a/deepeval/models/answer_relevancy_model.py
+++ b/deepeval/models/answer_relevancy_model.py
@@ -2,10 +2,12 @@
 from typing import Optional
 from deepeval.models.base import DeepEvalBaseModel
 
+
 def softmax(x):
     e_x = np.exp(x - np.max(x))
     return e_x / e_x.sum(axis=0)
 
+
 class AnswerRelevancyModel(DeepEvalBaseModel):
     def __init__(self, model_name: Optional[str] = None):
         model_name = (
diff --git a/deepeval/models/detoxify_model.py b/deepeval/models/detoxify_model.py
index c5a896767..00f72ea8d 100644
--- a/deepeval/models/detoxify_model.py
+++ b/deepeval/models/detoxify_model.py
@@ -1,14 +1,19 @@
-import torch 
+import torch
 from deepeval.models.base import DeepEvalBaseModel
 from detoxify import Detoxify
 
+
 class DetoxifyModel(DeepEvalBaseModel):
     def __init__(self, model_name: str | None = None, *args, **kwargs):
         if model_name is not None:
-            assert model_name in ["original", "unbiased", "multilingual"], "Invalid model. Available variants: original, unbiased, multilingual"
-        model_name = 'original' if model_name is None else model_name
+            assert model_name in [
+                "original",
+                "unbiased",
+                "multilingual",
+            ], "Invalid model. Available variants: original, unbiased, multilingual"
+        model_name = "original" if model_name is None else model_name
         super().__init__(model_name, *args, **kwargs)
-    
+
     def load_model(self):
         device = "cuda" if torch.cuda.is_available() else "cpu"
         return Detoxify(self.model_name, device=device)
@@ -19,4 +24,3 @@ def _call(self, text: str):
             toxicity_score_dict
         )
         return mean_toxicity_score, toxicity_score_dict
-    
\ No newline at end of file
diff --git a/deepeval/models/factual_consistency_model.py b/deepeval/models/factual_consistency_model.py
index b111d0b98..ca5e40e0c 100644
--- a/deepeval/models/factual_consistency_model.py
+++ b/deepeval/models/factual_consistency_model.py
@@ -1,17 +1,22 @@
-import os 
+import os
 from deepeval.models.base import DeepEvalBaseModel
 from sentence_transformers import CrossEncoder
 from deepeval.utils import softmax
 
+
 class FactualConsistencyModel(DeepEvalBaseModel):
     def __init__(self, model_name: str | None = None, *args, **kwargs):
-        model_name = "cross-encoder/nli-deberta-v3-large" if model_name is None else model_name
+        model_name = (
+            "cross-encoder/nli-deberta-v3-large"
+            if model_name is None
+            else model_name
+        )
         os.environ["TOKENIZERS_PARALLELISM"] = "false"
         super().__init__(model_name, *args, **kwargs)
-    
+
     def load_model(self):
         return CrossEncoder(self.model_name)
-    
+
     def _call(self, text_a: str, text_b: str):
         scores = self.model.predict([(text_a, text_b), (text_b, text_a)])
         # https://huggingface.co/cross-encoder/nli-deberta-base
@@ -20,4 +25,3 @@ def _call(self, text_a: str, text_b: str):
         score = softmax_scores[0][1]
         second_score = softmax_scores[1][1]
         return max(score, second_score)
-
diff --git a/deepeval/models/hallucination_model.py b/deepeval/models/hallucination_model.py
index cd0b0282c..65c4681bf 100644
--- a/deepeval/models/hallucination_model.py
+++ b/deepeval/models/hallucination_model.py
@@ -1,16 +1,18 @@
 import os
-from typing import Optional 
+from typing import Optional
 from deepeval.singleton import Singleton
 from sentence_transformers import CrossEncoder
 from deepeval.progress_context import progress_context
 
 
 class HallucinationModel(metaclass=Singleton):
-    def __init__(
-        self, model_name: Optional[str] = None
-    ):
+    def __init__(self, model_name: Optional[str] = None):
         # We use a smple cross encoder model
-        model_name = "vectara/hallucination_evaluation_model" if model_name is None else model_name
+        model_name = (
+            "vectara/hallucination_evaluation_model"
+            if model_name is None
+            else model_name
+        )
         os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
         # TODO: add this progress context in the correct place
diff --git a/deepeval/models/summac_model.py b/deepeval/models/summac_model.py
index c8f3eeff5..7df978794 100644
--- a/deepeval/models/summac_model.py
+++ b/deepeval/models/summac_model.py
@@ -1,32 +1,64 @@
-import torch 
-from typing import Union, List 
+import torch
+from typing import Union, List
 from typing import List, Union, get_origin
 from deepeval.models.base import DeepEvalBaseModel
 from deepeval.models._summac_model import _SummaCZS
 
+
 class SummaCModels(DeepEvalBaseModel):
-    def __init__(self, model_name: str | None = None, granularity: str | None = None, device: str | None = None, *args, **kwargs):
+    def __init__(
+        self,
+        model_name: str | None = None,
+        granularity: str | None = None,
+        device: str | None = None,
+        *args,
+        **kwargs
+    ):
         model_name = "vitc" if model_name is None else model_name
         self.granularity = "sentence" if granularity is None else granularity
-        self.device = device if device is not None else "cuda" if torch.cuda.is_available() else "cpu"
+        self.device = (
+            device
+            if device is not None
+            else "cuda"
+            if torch.cuda.is_available()
+            else "cpu"
+        )
         super().__init__(model_name, *args, **kwargs)
-    
-    def load_model(self, op1: str | None = "max", op2: str | None = "mean", use_ent: bool | None = True, use_con: bool | None = True, image_load_cache: bool | None = True, **kwargs):
+
+    def load_model(
+        self,
+        op1: str | None = "max",
+        op2: str | None = "mean",
+        use_ent: bool | None = True,
+        use_con: bool | None = True,
+        image_load_cache: bool | None = True,
+        **kwargs
+    ):
         return _SummaCZS(
             model_name=self.model_name,
             granularity=self.granularity,
             device=self.device,
-            op1=op1, op2=op2, use_con=use_con, use_ent=use_ent,
+            op1=op1,
+            op2=op2,
+            use_con=use_con,
+            use_ent=use_ent,
             imager_load_cache=image_load_cache,
             **kwargs
         )
-    
-    def _call(self, predictions: Union[str, List[str]], targets: Union[str, List[str]]) -> Union[float, dict]:
+
+    def _call(
+        self, predictions: Union[str, List[str]], targets: Union[str, List[str]]
+    ) -> Union[float, dict]:
         list_type = List[str]
-        if get_origin(predictions) is list_type and get_origin(targets) is list_type:
+        if (
+            get_origin(predictions) is list_type
+            and get_origin(targets) is list_type
+        ):
             return self.model.score(targets, predictions)
         elif isinstance(predictions, str) and isinstance(targets, str):
             return self.model.score_one(targets, predictions)
         else:
-            raise TypeError('Either both predictions and targets should be List or both should be string')
\ No newline at end of file
+            raise TypeError(
+                "Either both predictions and targets should be List or both should be string"
+            )
diff --git a/deepeval/models/unbias_model.py b/deepeval/models/unbias_model.py
index d625f3b00..5104b214f 100644
--- a/deepeval/models/unbias_model.py
+++ b/deepeval/models/unbias_model.py
@@ -1,17 +1,18 @@
 from typing import Optional
 from deepeval.models.base import DeepEvalBaseModel
 
+
 class UnBiasedModel(DeepEvalBaseModel):
     def __init__(self, model_name: str | None = None, *args, **kwargs):
         model_name = "original" if model_name is None else model_name
         super().__init__(model_name, *args, **kwargs)
-    
+
     def load_model(self):
         try:
             from Dbias.bias_classification import classifier
         except ImportError as e:
             print("Run `pip install deepeval[bias]`")
         return classifier
-    
+
     def _call(self, text):
-        return self.model(text)
\ No newline at end of file
+        return self.model(text)
diff --git a/deepeval/scorer/scorer.py b/deepeval/scorer/scorer.py
index 3037cc140..f7a19f643 100644
--- a/deepeval/scorer/scorer.py
+++ b/deepeval/scorer/scorer.py
@@ -171,10 +171,15 @@ def bert_score(
             "bert-recall": recall.detach().numpy().tolist(),
             "bert-f1": f1.detach().numpy().tolist(),
         }
-    
+
     @classmethod
     def faithfulness_score(
-        cls, target: str, prediction: str, model: Optional[str] = None, granularity: Optional[str]=None, device: Optional[str]=None
+        cls,
+        target: str,
+        prediction: str,
+        model: Optional[str] = None,
+        granularity: Optional[str] = None,
+        device: Optional[str] = None,
     ) -> float:
         """Calculate the faithfulness score of a prediction compared to a target text using SummaCZS.
 
@@ -188,8 +193,8 @@
 
         Returns:
             float: The computed faithfulness score. Higher values indicate greater faithfulness to the target text.
-        
-        Right now we are using score_one method under the hood. Instead of scoring multiple predictions for faithfullness. 
+
+        Right now we are using score_one method under the hood. Instead of scoring multiple predictions for faithfullness.
         """
         try:
             from deepeval.models import SummaCModels
         except Exception as e:
             print(f"SummaCZS model can not be loaded.\n{e}")
 
         scorer = SummaCModels(
-            model_name=model,
-            granularity=granularity,
-            device=device
+            model_name=model, granularity=granularity, device=device
         )
 
         return scorer(target, prediction)["score"]
@@ -337,27 +340,29 @@ def answer_relevancy_score(
 
     @classmethod
     def factual_consistency_score(
-        cls, contexts: Union[List[str], str], prediction: str, model: Optional[str]=None
+        cls,
+        contexts: Union[List[str], str],
+        prediction: str,
+        model: Optional[str] = None,
     ) -> float:
         try:
             from deepeval.models import FactualConsistencyModel
         except Exception as e:
             print(f"Unable to load FactualConsistencyModel\n{e}")
-        
+
         scorer = FactualConsistencyModel(model)
         contexts = [contexts] if isinstance(contexts, str) else contexts
-        max_score = 0 
+        max_score = 0
         for context in contexts:
             score = scorer.predict(context, prediction)
             max_score = max(max_score, score)
         return max_score
-    
+
     @classmethod
-    def neural_bias_score(cls, text: str, model: Optional[str]=None) -> float:
+    def neural_bias_score(cls, text: str, model: Optional[str] = None) -> float:
         try:
             from deepeval.models import UnBiasedModel
         except Exception as e:
             print(f"Unable to load UnBiasedModel.\n{e}")
         scorer = UnBiasedModel(model_name=model)
         return scorer(text)
-    
\ No newline at end of file