From 2c69c8acece80debdbfe0d1d7b41482c8f56c194 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Mon, 4 Mar 2024 16:46:53 +0800 Subject: [PATCH 01/59] . --- deepeval/synthesizer/synthesizer.py | 4 +++- deepeval/test_case/conversational_test_case.py | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/deepeval/synthesizer/synthesizer.py b/deepeval/synthesizer/synthesizer.py index bb71a36c5..bca0ff996 100644 --- a/deepeval/synthesizer/synthesizer.py +++ b/deepeval/synthesizer/synthesizer.py @@ -60,6 +60,8 @@ def _generate( synthetic_data = [SyntheticData(**item) for item in data["data"]] temp_goldens: List[Golden] = [] for data in synthetic_data: + # TODO: evolution + golden = Golden( input=data.input, expectedOutput=data.expected_output, @@ -116,7 +118,7 @@ def generate_goldens( SyntheticData(**item) for item in data["data"] ] - # TODO: optional evolution + # TODO: evolution # TODO: review synthetic data for data in synthetic_data: diff --git a/deepeval/test_case/conversational_test_case.py b/deepeval/test_case/conversational_test_case.py index 2be8da0f3..e5925a890 100644 --- a/deepeval/test_case/conversational_test_case.py +++ b/deepeval/test_case/conversational_test_case.py @@ -1,6 +1,5 @@ from dataclasses import dataclass -from typing import List, Optional -from enum import Enum +from typing import List from .llm_test_case import LLMTestCase From 33a7aa955f272ff3ba1134172958e09d0c58d8c4 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Mon, 4 Mar 2024 16:50:16 +0800 Subject: [PATCH 02/59] rename set hyperparameteres to log --- deepeval/__init__.py | 4 ++-- deepeval/decorators/hyperparameters.py | 2 +- docs/docs/confident-ai-analyze-evaluations.mdx | 4 ++-- docs/docs/getting-started.mdx | 2 +- examples/getting_started/test_example.py | 2 +- tests/test_deployment.py | 2 +- tests/test_hallucination.py | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/deepeval/__init__.py b/deepeval/__init__.py index 1d8e07e74..eff7fcd9f 100644 --- a/deepeval/__init__.py +++ b/deepeval/__init__.py @@ -4,14 +4,14 @@ # Optionally add telemtry from ._version import __version__ -from .decorators.hyperparameters import set_hyperparameters +from .decorators.hyperparameters import log_hyperparameters from deepeval.event import track from deepeval.evaluate import evaluate, run_test, assert_test from deepeval.test_run import on_test_run_end from deepeval.telemetry import * __all__ = [ - "set_hyperparameters", + "log_hyperparameters", "track", "evaluate", "run_test", diff --git a/deepeval/decorators/hyperparameters.py b/deepeval/decorators/hyperparameters.py index 4f33b3a66..0e53decf9 100644 --- a/deepeval/decorators/hyperparameters.py +++ b/deepeval/decorators/hyperparameters.py @@ -3,7 +3,7 @@ _user_prompt_template = None -def set_hyperparameters(model: str, prompt_template: str): +def log_hyperparameters(model: str, prompt_template: str): def decorator(func): global _model, _user_prompt_template _model = model diff --git a/docs/docs/confident-ai-analyze-evaluations.mdx b/docs/docs/confident-ai-analyze-evaluations.mdx index 710e32226..aa9b8c41d 100644 --- a/docs/docs/confident-ai-analyze-evaluations.mdx +++ b/docs/docs/confident-ai-analyze-evaluations.mdx @@ -41,7 +41,7 @@ Question: # Although the values in this example are hardcoded, # you should ideally pass in variables to keep things dynamic -@deepeval.set_hyperparameters(model="gpt-4", prompt_template=prompt_template) +@deepeval.log_hyperparameters(model="gpt-4", prompt_template=prompt_template) def hyperparameters(): # Any 
additional hyperparameters you wish to keep track of return { @@ -50,7 +50,7 @@ def hyperparameters(): } ``` -You **MUST** provide the `model` and `prompt_template` argument to `set_hyperparameters` in order for `deepeval` to know which LLM and prompt template you're evaluating. +You **MUST** provide the `model` and `prompt_template` argument to `log_hyperparameters` in order for `deepeval` to know which LLM and prompt template you're evaluating. :::note This only works if you're running evaluations using `deepeval test run`. If you're not already using `deepeval test run` for evaluations, we highly recommend you to start using it. diff --git a/docs/docs/getting-started.mdx b/docs/docs/getting-started.mdx index 6aba1d4aa..f57e5544b 100644 --- a/docs/docs/getting-started.mdx +++ b/docs/docs/getting-started.mdx @@ -302,7 +302,7 @@ Question: # Although the values in this example are hardcoded, # you should ideally pass in variables to keep things dynamic -@deepeval.set_hyperparameters(model="gpt-4", prompt_template=prompt_template) +@deepeval.log_hyperparameters(model="gpt-4", prompt_template=prompt_template) def hyperparameters(): # Any additional hyperparameters you wish to keep track of return { diff --git a/examples/getting_started/test_example.py b/examples/getting_started/test_example.py index 287401f32..8f6bc5201 100644 --- a/examples/getting_started/test_example.py +++ b/examples/getting_started/test_example.py @@ -99,6 +99,6 @@ def test_everything(): # Although the values in this example are hardcoded, # you should ideally pass in variables to keep things dynamic -@deepeval.set_hyperparameters(model="gpt-4", prompt_template=prompt_template) +@deepeval.log_hyperparameters(model="gpt-4", prompt_template=prompt_template) def hyperparameters(): return {"chunk_size": 500, "temperature": 0} diff --git a/tests/test_deployment.py b/tests/test_deployment.py index b6b0f1134..6fb1e0c09 100644 --- a/tests/test_deployment.py +++ b/tests/test_deployment.py @@ -52,7 +52,7 @@ def test_customer_chatbot(test_case: LLMTestCase): """ -@deepeval.set_hyperparameters(model="gpt-4", prompt_template=prompt_template) +@deepeval.log_hyperparameters(model="gpt-4", prompt_template=prompt_template) def hyperparameters(): return { "chunk_size": 500, diff --git a/tests/test_hallucination.py b/tests/test_hallucination.py index b3ca5a239..2986f0c62 100644 --- a/tests/test_hallucination.py +++ b/tests/test_hallucination.py @@ -52,7 +52,7 @@ def test_hallucination_metric_3(): # {question} # """ -# @deepeval.set_hyperparameters(model="gpt-4", prompt_template=prompt_template) +# @deepeval.log_hyperparameters(model="gpt-4", prompt_template=prompt_template) # def hyperparameters(): # return { # "model": "gpt-4", From eccd9e4e4e8220ea465c653669641ccc2b66d941 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Mon, 4 Mar 2024 16:53:56 +0800 Subject: [PATCH 03/59] new release --- deepeval/_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deepeval/_version.py b/deepeval/_version.py index e198eb425..a38997c63 100644 --- a/deepeval/_version.py +++ b/deepeval/_version.py @@ -1 +1 @@ -__version__: str = "0.20.78" +__version__: str = "0.20.79" diff --git a/pyproject.toml b/pyproject.toml index 0f1ad510a..cb56a20eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "deepeval" -version = "0.20.78" +version = "0.20.79" description = "The Evaluation Framework for LLMs" authors = ["Jeffrey Ip "] license = "Apache-2.0" From 
f918c75ea77d00d012a725f7035db380782d170b Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Mon, 4 Mar 2024 17:06:57 +0800 Subject: [PATCH 04/59] remove retrieval context from AR --- .../metrics/answer_relevancy/answer_relevancy.py | 8 +++----- deepeval/metrics/answer_relevancy/template.py | 16 +++++----------- tests/test_answer_relevancy.py | 6 +++--- 3 files changed, 11 insertions(+), 19 deletions(-) diff --git a/deepeval/metrics/answer_relevancy/answer_relevancy.py b/deepeval/metrics/answer_relevancy/answer_relevancy.py index 081f0a215..d835503f9 100644 --- a/deepeval/metrics/answer_relevancy/answer_relevancy.py +++ b/deepeval/metrics/answer_relevancy/answer_relevancy.py @@ -35,7 +35,6 @@ def measure(self, test_case: LLMTestCase) -> float: if ( test_case.input is None or test_case.actual_output is None - or test_case.retrieval_context is None ): raise ValueError( "Input, actual output, or retrieval context cannot be None" @@ -49,10 +48,10 @@ def measure(self, test_case: LLMTestCase) -> float: # generate verdicts based on statements, and retrieval context self.verdicts: List[AnswerRelvancyVerdict] = ( self._generate_verdicts( - test_case.input, test_case.retrieval_context + test_case.input ) ) - + answer_relevancy_score = self._generate_score() self.reason = self._generate_reason( @@ -93,12 +92,11 @@ def _generate_reason(self, input: str, score: float) -> str: return res def _generate_verdicts( - self, input: str, retrieval_context=List[str] + self, input: str ) -> List[AnswerRelvancyVerdict]: prompt = AnswerRelevancyTemplate.generate_verdicts( input=input, actual_output=self.statements, - retrieval_context="\n\n".join(retrieval_context), ) res = self.model(prompt) diff --git a/deepeval/metrics/answer_relevancy/template.py b/deepeval/metrics/answer_relevancy/template.py index dbd82c0e0..29c7384da 100644 --- a/deepeval/metrics/answer_relevancy/template.py +++ b/deepeval/metrics/answer_relevancy/template.py @@ -22,22 +22,19 @@ def generate_statements(actual_output): """ @staticmethod - def generate_verdicts(input, actual_output, retrieval_context): + def generate_verdicts(input, actual_output): return f"""For the provided list of statements, determine whether each statement is relevant to address the input. Please generate a list of JSON with two keys: `verdict` and `reason`. -The 'verdict' key should STRICTLY be either a 'yes', 'idk' or 'no'. Answer 'yes' if the statement is relevant to addressing the original input, 'no' if the statement is irrelevant, and 'idk' if it is ambiguous (eg., not directly relevant but could be used as a supporting point to address the input). You can use the information in the retrieval context to support your decision. +The 'verdict' key should STRICTLY be either a 'yes', 'idk' or 'no'. Answer 'yes' if the statement is relevant to addressing the original input, 'no' if the statement is irrelevant, and 'idk' if it is ambiguous (eg., not directly relevant but could be used as a supporting point to address the input). The 'reason' is the reason for the verdict. Provide a 'reason' ONLY if the answer is 'no'. The provided statements are statements made in the actual output. ** IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key mapping to a list of JSON objects. +Example input: What should I do if there is an earthquake? 
Example statements: ["Shoes.", "Thanks for asking the question!", "Is there anything else I can help you with?", "Duck and hide"] -Example retrieval context: ["In the unlikely event of an earthquake, you should duck and hide under a table."] - -Example: -Input: What should I do if there is an earthquake? - +Example JSON: {{ "verdicts": [ {{ @@ -57,10 +54,7 @@ def generate_verdicts(input, actual_output, retrieval_context): }} Since you are going to generate a verdict for each statement, the number of 'verdicts' SHOULD BE STRICTLY EQUAL to the number of `statements`. -** - -Retrieval Context: -{retrieval_context} +** Input: {input} diff --git a/tests/test_answer_relevancy.py b/tests/test_answer_relevancy.py index f663c9b0c..0803b1fd7 100644 --- a/tests/test_answer_relevancy.py +++ b/tests/test_answer_relevancy.py @@ -46,12 +46,12 @@ """ -@pytest.mark.skip(reason="openai is very expensive") +# @pytest.mark.skip(reason="openai is very expensive") def test_answer_relevancy(): metric = AnswerRelevancyMetric(threshold=0.5) test_case = LLMTestCase( - input=question, - actual_output=answer, + input="How many ducks are there in the pond?", + actual_output="Five", retrieval_context=[one, two, three], ) assert_test(test_case, [metric]) From 139a1b56b877350766966e27ed1d446ad03213a2 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Mon, 4 Mar 2024 17:07:03 +0800 Subject: [PATCH 05/59] reformat --- .../metrics/answer_relevancy/answer_relevancy.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/deepeval/metrics/answer_relevancy/answer_relevancy.py b/deepeval/metrics/answer_relevancy/answer_relevancy.py index d835503f9..48a8aaa72 100644 --- a/deepeval/metrics/answer_relevancy/answer_relevancy.py +++ b/deepeval/metrics/answer_relevancy/answer_relevancy.py @@ -32,10 +32,7 @@ def __init__( self.n = 5 def measure(self, test_case: LLMTestCase) -> float: - if ( - test_case.input is None - or test_case.actual_output is None - ): + if test_case.input is None or test_case.actual_output is None: raise ValueError( "Input, actual output, or retrieval context cannot be None" ) @@ -47,11 +44,9 @@ def measure(self, test_case: LLMTestCase) -> float: # generate verdicts based on statements, and retrieval context self.verdicts: List[AnswerRelvancyVerdict] = ( - self._generate_verdicts( - test_case.input - ) + self._generate_verdicts(test_case.input) ) - + answer_relevancy_score = self._generate_score() self.reason = self._generate_reason( @@ -91,9 +86,7 @@ def _generate_reason(self, input: str, score: float) -> str: res = self.model(prompt) return res - def _generate_verdicts( - self, input: str - ) -> List[AnswerRelvancyVerdict]: + def _generate_verdicts(self, input: str) -> List[AnswerRelvancyVerdict]: prompt = AnswerRelevancyTemplate.generate_verdicts( input=input, actual_output=self.statements, From 5dd4aeda7f85df1bba94f2e984eae3bdc93868ca Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Mon, 4 Mar 2024 17:07:28 +0800 Subject: [PATCH 06/59] . 
--- tests/test_answer_relevancy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_answer_relevancy.py b/tests/test_answer_relevancy.py index 0803b1fd7..ad2e9b2f1 100644 --- a/tests/test_answer_relevancy.py +++ b/tests/test_answer_relevancy.py @@ -46,7 +46,7 @@ """ -# @pytest.mark.skip(reason="openai is very expensive") +@pytest.mark.skip(reason="openai is very expensive") def test_answer_relevancy(): metric = AnswerRelevancyMetric(threshold=0.5) test_case = LLMTestCase( From 1ea246f896b59588a7177259b7c92a5675d555f0 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Mon, 4 Mar 2024 22:51:47 +0800 Subject: [PATCH 07/59] . --- docs/docs/getting-started.mdx | 4 ++-- docs/src/pages/index.js | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/docs/getting-started.mdx b/docs/docs/getting-started.mdx index f57e5544b..42142bf7f 100644 --- a/docs/docs/getting-started.mdx +++ b/docs/docs/getting-started.mdx @@ -77,8 +77,8 @@ deepeval test run test_example.py - The variable `retrieval_context` contains the retrieved context from your knowledge base, and `AnswerRelevancyMetric(threshold=0.5)` is an default metric provided by `deepeval` for you to evaluate your LLM output's relevancy based on the provided retrieval context. - All metric scores range from 0 - 1, which the `threshold=0.5` threshold ultimately determines if your test have passed or not. -:::note -You'll need to set your `OPENAI_API_KEY` as an enviornment variable before running the `AnswerRelevancyMetric`, since the `AnswerRelevancyMetric` is an LLM-evaluated metric. You use a custom LLM instead of OpenAI, [check out this part of the docs](evaluation-introduction). +:::info +You'll need to set your `OPENAI_API_KEY` as an enviornment variable before running the `AnswerRelevancyMetric`, since the `AnswerRelevancyMetric` is an LLM-evaluated metric. To use **ANY** custom LLM of your choice, [check out this part of the docs](evaluation-introduction#using-a-custom-llm). ::: To save the test results locally for each test run, set the `DEEPEVAL_RESULTS_FOLDER` environement variable to your relative path of choice: diff --git a/docs/src/pages/index.js b/docs/src/pages/index.js index 577ec4ba3..422482336 100644 --- a/docs/src/pages/index.js +++ b/docs/src/pages/index.js @@ -79,7 +79,7 @@ class Index extends React.Component {
-              $ the open-source evaluation framework for LLMs
+              $ the open-source LLM evaluation framework
Get Started
From eae233a221798cf6e75a239111fb8484d70b07db Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Mon, 4 Mar 2024 22:57:55 +0800 Subject: [PATCH 08/59] added script login --- deepeval/__init__.py | 2 ++ deepeval/cli/main.py | 6 +----- deepeval/utils.py | 4 ++++ 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/deepeval/__init__.py b/deepeval/__init__.py index eff7fcd9f..30337691f 100644 --- a/deepeval/__init__.py +++ b/deepeval/__init__.py @@ -8,9 +8,11 @@ from deepeval.event import track from deepeval.evaluate import evaluate, run_test, assert_test from deepeval.test_run import on_test_run_end +from deepeval.utils import login_with_confident_api_key from deepeval.telemetry import * __all__ = [ + "login_with_confident_api_key", "log_hyperparameters", "track", "evaluate", diff --git a/deepeval/cli/main.py b/deepeval/cli/main.py index 78596596b..f7de97384 100644 --- a/deepeval/cli/main.py +++ b/deepeval/cli/main.py @@ -1,11 +1,7 @@ import typer from typing_extensions import Annotated -# Rich has a few dependency issues -try: - from rich import print -except Exception as e: - pass +from rich import print from deepeval.key_handler import KEY_FILE_HANDLER, KeyValues from deepeval.cli.test import app as test_app from typing import Optional diff --git a/deepeval/utils.py b/deepeval/utils.py index 57d96e3c3..089fb682f 100644 --- a/deepeval/utils.py +++ b/deepeval/utils.py @@ -14,6 +14,10 @@ from deepeval.key_handler import KeyValues, KEY_FILE_HANDLER +def login_with_confident_api_key(api_key: string): + from rich import print + KEY_FILE_HANDLER.write_key(KeyValues.API_KEY, api_key) + print("Congratulations! Login successful :raising_hands: ") def set_is_running_deepeval(flag: bool): if flag: From 2d45688accbeda88d23ca6224bb17b7c1203377e Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Mon, 4 Mar 2024 22:58:09 +0800 Subject: [PATCH 09/59] reformat --- deepeval/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/deepeval/utils.py b/deepeval/utils.py index 089fb682f..297803620 100644 --- a/deepeval/utils.py +++ b/deepeval/utils.py @@ -14,11 +14,14 @@ from deepeval.key_handler import KeyValues, KEY_FILE_HANDLER + def login_with_confident_api_key(api_key: string): from rich import print + KEY_FILE_HANDLER.write_key(KeyValues.API_KEY, api_key) print("Congratulations! 
Login successful :raising_hands: ") + def set_is_running_deepeval(flag: bool): if flag: os.environ["DEEPEVAL"] = "YES" From 0a9d657020c2dae73b201021f31e023d4ff01819 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Tue, 5 Mar 2024 00:42:35 +0800 Subject: [PATCH 10/59] evolved text POC --- deepeval/synthesizer/synthesizer.py | 42 ++++--- deepeval/synthesizer/template.py | 174 +++++++++++++++++----------- 2 files changed, 133 insertions(+), 83 deletions(-) diff --git a/deepeval/synthesizer/synthesizer.py b/deepeval/synthesizer/synthesizer.py index bca0ff996..38c4c8997 100644 --- a/deepeval/synthesizer/synthesizer.py +++ b/deepeval/synthesizer/synthesizer.py @@ -23,7 +23,6 @@ class SyntheticData(BaseModel): input: str - expected_output: str class Synthesizer: @@ -45,6 +44,22 @@ def __init__( # self.batch_size = batch_size self.synthetic_goldens: List[Golden] = [] + def _evolve_text(self, text, context: List[str]) -> List[str]: + evolved_texts: List[str] = [] + for i in range(2): + if i == 0: + prompt = EvolutionTemplate.name_to_be_decided_evolution( + input=text, context=context + ) + else: + prompt = EvolutionTemplate.second_name_to_be_decided_evolution( + input=text, context=context + ) + res = self.model(prompt) + evolved_texts.append(res) + return evolved_texts + + def _generate( self, context: List[str], @@ -61,13 +76,10 @@ def _generate( temp_goldens: List[Golden] = [] for data in synthetic_data: # TODO: evolution - - golden = Golden( - input=data.input, - expectedOutput=data.expected_output, - context=context, - ) - temp_goldens.append(golden) + # Note: skip multithreading for now + for evolved_input in self._evolve_text(data.input, context=context): + golden = Golden(input=evolved_input, context=context) + temp_goldens.append(golden) with lock: goldens.extend(temp_goldens) @@ -117,17 +129,11 @@ def generate_goldens( synthetic_data = [ SyntheticData(**item) for item in data["data"] ] - - # TODO: evolution - - # TODO: review synthetic data for data in synthetic_data: - golden = Golden( - input=data.input, - expectedOutput=data.expected_output, - context=context, - ) - goldens.append(golden) + for evolved_input in self._evolve_text(data.input, context=context): + golden = Golden(input=evolved_input, context=context) + goldens.append(golden) + self.synthetic_goldens.extend(goldens) diff --git a/deepeval/synthesizer/template.py b/deepeval/synthesizer/template.py index fcd5e334a..a7dba38d4 100644 --- a/deepeval/synthesizer/template.py +++ b/deepeval/synthesizer/template.py @@ -1,108 +1,152 @@ class SynthesizerTemplate: @staticmethod def generate_synthetic_data(context, max_goldens_per_context): - return f"""I want you act as a copywriter. Based on the given context, which is list of strings, please generate a list of JSON objects with the `input` and `expected_output` key. + return f"""I want you act as a copywriter. Based on the given context, which is list of strings, please generate a list of JSON objects with a `input` key. The `input` can either be a question or a statement that can be addressed by the given context. -The `expected_output` is what an ideal output should look like for the corresponding generated input. -The `expected_output` should NEVER contradict the given context in any way. ** IMPORTANT: Please make sure to only return in JSON format, with the 'data' key as a list of JSON objects. -You MUST TRY to generate {max_goldens_per_context} data points, unless there is too little context such that the `input` and `expected_output` is getting reptitive. 
+You MUST TRY to generate {max_goldens_per_context} data points, unless the `input` is getting reptitive. Example context: ["Einstein won the Nobel Prize for his discovery of penicillin.", "Einstein won the Nobel Prize in 1968."] +Example max goldens per context: 2 Example JSON: {{ "data": [ {{ - "input": "What was Einstein known for?", - "expected_output": "Einstein was known for his discovery of penicillin. He also won the Nobel Prize in 1968." + "input": "What was Einstein known for?" }}, {{ - "input": "Einstein was a smart guy huh", - "expected_output": "He sure was! Einstein allegedly had an IQ of 200 and was the first to disocver penicillin." + "input": "Einstein was a smart guy huh" }} ] }} You should NOT incorporate any prior knowledge you have and take each context at face value. -You should NOT be lazy and simply copy the context as the `expected_output`. You MUST include at least one statement as the input. -Both `input` and `expected_output` are STRINGS. -You MUST TRY to generate {max_goldens_per_context} data points, unless there is too little context such that the `input` and `expected_output` is getting reptitive. +`input` MUST be a STRING. +You MUST TRY to generate {max_goldens_per_context} data points, unless the `input` is getting reptitive. ** +Max Goldens Per Context: +{max_goldens_per_context} + Context: {context} JSON: """ - - + ###################################################################################################### -##### Templates taken from https://github.com/nlpxucan/WizardLM/blob/main/Evol_Instruct/depth.py ##### +##### Approach similar to https://github.com/nlpxucan/WizardLM/blob/main/Evol_Instruct/depth.py ###### ###################################################################################################### -# TODO: Modify this +# generate_constraints_prompt +# "Please add one more constraints/requirements into #The Given Prompt#'" -depth_base_instruction = "I want you act as a Prompt Rewriter.\r\n \ - Your objective is to rewrite a given prompt into a more complex version to make those famous AI systems (e.g., chatgpt and GPT4) a bit harder to handle.\r\n \ - But the rewritten prompt must be reasonable and must be understood and responded by humans.\r\n \ - Your rewriting cannot omit the non-text parts such as the table and code in #The Given Prompt#:. Also, please do not omit the input in #The Given Prompt#. \r\n \ - You SHOULD complicate the given prompt using the following method: \r\n\ - {} \r\n\ - You should try your best not to make the #Rewritten Prompt# become verbose, #Rewritten Prompt# can only add 10 to 20 words into #The Given Prompt#. \r\n\ - '#The Given Prompt#', '#Rewritten Prompt#', 'given prompt' and 'rewritten prompt' are not allowed to appear in #Rewritten Prompt#\r\n" +# generate_deepen_prompt +# "If #The Given Prompt# contains inquiries about certain issues, the depth and breadth of the inquiry can be increased." 
-breadth_base_instruction = "I want you act as a Prompt Creator.\r\n\ -Your goal is to draw inspiration from the #Given Prompt# to create a brand new prompt.\r\n\ -This new prompt should belong to the same domain as the #Given Prompt# but be even more rare.\r\n\ -The LENGTH and complexity of the #Created Prompt# should be similar to that of the #Given Prompt#.\r\n\ -The #Created Prompt# must be reasonable and must be understood and responded by humans.\r\n\ -'#Given Prompt#', '#Created Prompt#', 'given prompt' and 'created prompt' are not allowed to appear in #Created Prompt#\r\n" +# generate_concretizing_prompt +# "Please replace general concepts with more specific concepts." +# generate_reasoning_prompt +# "If #The Given Prompt# can be solved with just a few simple thinking processes, you can rewrite it to explicitly request multiple-step reasoning." + +depth_base_instruction = """I want you to act as an input rewriter. +Your object is the rewrite a given `input` and must be factually correct according to the supporting information in `context`. +You MUST complicate the given `input` using the following method:""" class EvolutionTemplate: @staticmethod - def generate_constraints_prompt(instruction): - prompt = depth_base_instruction.format( - "Please add one more constraints/requirements into #The Given Prompt#'" - ) - prompt += "#The Given Prompt#: \r\n {} \r\n".format(instruction) - prompt += "#Rewritten Prompt#:\r\n" - return prompt + def name_to_be_decided_evolution(input, context): + return depth_base_instruction + f""" + 1. Using information from the provided `context`, complicate the `input` by replacing general concepts/inquiries with more specific ones. + 2. Using information help from the provided `context`, increase the depth and breadth of the `input`. - @staticmethod - def generate_deepen_prompt(instruction): - prompt = depth_base_instruction.format( - "If #The Given Prompt# contains inquiries about certain issues, the depth and breadth of the inquiry can be increased." - ) - prompt += "#The Given Prompt#: \r\n {} \r\n".format(instruction) - prompt += "#Rewritten Prompt#:\r\n" - return prompt +** +EXAMPLES +Example context: +Rainforests are home to over half of the world's plant and animal species, making them key to maintaining global biodiversity. The variety of life found in these ecosystems contributes to genetic diversity, which is crucial for adaptation and survival amid changing environmental conditions. This biodiversity also supports ecosystem resilience, enabling forests to recover from disturbances. +The biodiversity in rainforests plays a significant role in human well-being, providing essential services such as air and water purification, disease control, and pollination of crops. Additionally, many medicines are derived from rainforest plants, highlighting the importance of these ecosystems for medical research and healthcare. +Example input: +Why is the biodiversity of rainforests important? +Example rewritten input: +How does the extensive biodiversity found in rainforests, encompassing over half of the world's plant and animal species, contribute to global biodiversity maintenance, and what role does this diversity play in enhancing ecosystem resilience, human health through disease control, crop pollination, and the development of medicines derived from rainforest plants? 
+ +-------------------------- + +Example context: +Bees play a critical role in pollinating flowering plants, including many fruits and vegetables, contributing to the diversity of plant life and the production of crops. Their activity supports the growth of trees, flowers, and other plants, which serve as food and shelter for numerous animals, thus maintaining ecosystem balance. +Beyond their impact on food crops, bees contribute to wild plant growth by pollinating a wide range of plants outside of agricultural settings. This pollination is vital for the reproduction of many plants, affecting entire ecosystems' health and sustainability. +Example input: +What is the role of bees in ecosystems? +Example rewritten input: +How do bees, through their pollination of flowering plants, including a multitude of fruits and vegetables, significantly influence the diversity of plant life and agricultural productivity, and in what ways do their activities extend beyond agricultural settings to support the growth of trees, flowers, and other plants, thereby providing essential resources for various animal species and contributing to the overall balance and sustainability of ecosystems? + +-------------------------- + +Example context: +Solar power generation relies on photovoltaic cells to convert sunlight into electricity. These cells are made of materials that exhibit the photovoltaic effect, which occurs when light photons are absorbed by the material, causing the generation of electrical current. +Solar panels, composed of many photovoltaic cells, collect sunlight and convert it into electrical power. This energy can then be used directly or stored in batteries for later use, providing a renewable and sustainable source of power with minimal environmental impact. +Example input: +What are the principles behind solar power generation? +Example rewritten input: +How do photovoltaic cells work to convert sunlight into electrical power, and what role do solar panels play in this process, including energy storage for sustainable use? +** - @staticmethod - def generate_concretizing_prompt(instruction): - prompt = depth_base_instruction.format( - "Please replace general concepts with more specific concepts." - ) - prompt += "#The Given Prompt#: \r\n {} \r\n".format(instruction) - prompt += "#Rewritten Prompt#:\r\n" - return prompt +Input: +{input} - @staticmethod - def generate_reasoning_prompt(instruction): - prompt = depth_base_instruction.format( - "If #The Given Prompt# can be solved with just a few simple thinking processes, you can rewrite it to explicitly request multiple-step reasoning." - ) - prompt += "#The Given Prompt#: \r\n {} \r\n".format(instruction) - prompt += "#Rewritten Prompt#:\r\n" - return prompt +Context: +{context} + +Rewritten Input: +""" @staticmethod - def generate_breadth_prompt(instruction): - prompt = breadth_base_instruction - prompt += "#Given Prompt#: \r\n {} \r\n".format(instruction) - prompt += "#Created Prompt#:\r\n" - return prompt + def second_name_to_be_decided_evolution(input, context): + return depth_base_instruction + f""" + 1. Using information from the provided `context`, complicate the `input` by adding at least one more constraints/requirements. + 2. Using information from the provided `context`, complicate the `input` by requiring multi-step reasoning. + +** +EXAMPLES +Example context: +Rainforests are home to over half of the world's plant and animal species, making them key to maintaining global biodiversity. 
The variety of life found in these ecosystems contributes to genetic diversity, which is crucial for adaptation and survival amid changing environmental conditions. This biodiversity also supports ecosystem resilience, enabling forests to recover from disturbances. +The biodiversity in rainforests plays a significant role in human well-being, providing essential services such as air and water purification, disease control, and pollination of crops. Additionally, many medicines are derived from rainforest plants, highlighting the importance of these ecosystems for medical research and healthcare. +Example input: +Why is the biodiversity of rainforests important? +Example rewritten input: +How does the biodiversity of rainforests contribute to ecosystem resilience and recovery from disturbances, and in what ways does it impact human well-being through services such as air and water purification, disease control, and crop pollination? + +-------------------------- + +Example context: +Bees play a critical role in pollinating flowering plants, including many fruits and vegetables, contributing to the diversity of plant life and the production of crops. Their activity supports the growth of trees, flowers, and other plants, which serve as food and shelter for numerous animals, thus maintaining ecosystem balance. +Beyond their impact on food crops, bees contribute to wild plant growth by pollinating a wide range of plants outside of agricultural settings. This pollination is vital for the reproduction of many plants, affecting entire ecosystems' health and sustainability. +Example input: +What is the role of bees in ecosystems? +Example rewritten input: +Considering the pivotal role bees play in pollinating both agricultural crops and wild plants, thereby contributing to the diversity of plant life and supporting the foundation of food chains, analyze how bees influence the growth and sustainability of various ecosystems. + +-------------------------- + +Example context: +Solar power generation relies on photovoltaic cells to convert sunlight into electricity. These cells are made of materials that exhibit the photovoltaic effect, which occurs when light photons are absorbed by the material, causing the generation of electrical current. +Solar panels, composed of many photovoltaic cells, collect sunlight and convert it into electrical power. This energy can then be used directly or stored in batteries for later use, providing a renewable and sustainable source of power with minimal environmental impact. +Example input: +What are the principles behind solar power generation? +Example rewritten input: +Examine the significance of rainforest biodiversity in sustaining ecosystem resilience and providing essential services such as disease control and crop pollination, alongside its critical role in medical research and the development of new medicines. Consider the broader implications of biodiversity loss on global ecological balance and human health. 
+** + +Context: +{context} + +Input: +{input} + +Rewritten Input: +""" \ No newline at end of file From 35ae0345f08f289630ce7cf4406666778d20ae0e Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Tue, 5 Mar 2024 00:42:42 +0800 Subject: [PATCH 11/59] reformat --- deepeval/synthesizer/synthesizer.py | 10 ++++++---- deepeval/synthesizer/template.py | 16 ++++++++++++---- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/deepeval/synthesizer/synthesizer.py b/deepeval/synthesizer/synthesizer.py index 38c4c8997..9e6b7a1a2 100644 --- a/deepeval/synthesizer/synthesizer.py +++ b/deepeval/synthesizer/synthesizer.py @@ -59,7 +59,6 @@ def _evolve_text(self, text, context: List[str]) -> List[str]: evolved_texts.append(res) return evolved_texts - def _generate( self, context: List[str], @@ -130,11 +129,14 @@ def generate_goldens( SyntheticData(**item) for item in data["data"] ] for data in synthetic_data: - for evolved_input in self._evolve_text(data.input, context=context): - golden = Golden(input=evolved_input, context=context) + for evolved_input in self._evolve_text( + data.input, context=context + ): + golden = Golden( + input=evolved_input, context=context + ) goldens.append(golden) - self.synthetic_goldens.extend(goldens) return goldens diff --git a/deepeval/synthesizer/template.py b/deepeval/synthesizer/template.py index a7dba38d4..c451cdcda 100644 --- a/deepeval/synthesizer/template.py +++ b/deepeval/synthesizer/template.py @@ -37,7 +37,8 @@ def generate_synthetic_data(context, max_goldens_per_context): JSON: """ - + + ###################################################################################################### ##### Approach similar to https://github.com/nlpxucan/WizardLM/blob/main/Evol_Instruct/depth.py ###### ###################################################################################################### @@ -58,10 +59,13 @@ def generate_synthetic_data(context, max_goldens_per_context): Your object is the rewrite a given `input` and must be factually correct according to the supporting information in `context`. You MUST complicate the given `input` using the following method:""" + class EvolutionTemplate: @staticmethod def name_to_be_decided_evolution(input, context): - return depth_base_instruction + f""" + return ( + depth_base_instruction + + f""" 1. Using information from the provided `context`, complicate the `input` by replacing general concepts/inquiries with more specific ones. 2. Using information help from the provided `context`, increase the depth and breadth of the `input`. @@ -104,10 +108,13 @@ def name_to_be_decided_evolution(input, context): Rewritten Input: """ + ) @staticmethod def second_name_to_be_decided_evolution(input, context): - return depth_base_instruction + f""" + return ( + depth_base_instruction + + f""" 1. Using information from the provided `context`, complicate the `input` by adding at least one more constraints/requirements. 2. Using information from the provided `context`, complicate the `input` by requiring multi-step reasoning. 
@@ -149,4 +156,5 @@ def second_name_to_be_decided_evolution(input, context): {input} Rewritten Input: -""" \ No newline at end of file +""" + ) From d2f82d924e85089a94343e70c94e4b0e7db1c9f0 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Tue, 5 Mar 2024 01:02:02 +0800 Subject: [PATCH 12/59] added actual output to save --- deepeval/dataset/dataset.py | 50 +++++++++++++++++++++++++++++ deepeval/synthesizer/synthesizer.py | 6 ++-- deepeval/types.py | 2 +- 3 files changed, 54 insertions(+), 4 deletions(-) diff --git a/deepeval/dataset/dataset.py b/deepeval/dataset/dataset.py index c2888af30..1a48190b3 100644 --- a/deepeval/dataset/dataset.py +++ b/deepeval/dataset/dataset.py @@ -2,7 +2,10 @@ from dataclasses import dataclass from rich.console import Console import json +import csv import webbrowser +import os +import datetime from deepeval.metrics import BaseMetric from deepeval.api import Api, Endpoints @@ -20,6 +23,7 @@ from deepeval.utils import is_confident from deepeval.synthesizer.base_synthesizer import BaseSynthesizer +valid_file_types = ["csv", "json"] @dataclass class EvaluationDataset: @@ -335,3 +339,49 @@ def generate_goldens( self.goldens.extend( synthesizer.generate_goldens(contexts, max_goldens_per_context) ) + + # TODO: add save test cases as well + def save_as(self, file_type: str, directory: str): + if file_type not in valid_file_types: + raise ValueError( + f"Invalid file type. Available file types to save as: {', '.join(type for type in valid_file_types)}" + ) + + if len(self.goldens) == 0: + raise ValueError( + f"No synthetic goldens found. Please generate goldens before attempting to save data as {file_type}" + ) + + new_filename = ( + datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + f".{file_type}" + ) + + if not os.path.exists(directory): + os.makedirs(directory) + + full_file_path = os.path.join(directory, new_filename) + + if file_type == "json": + with open(full_file_path, "w") as file: + json_data = [ + { + "input": golden.input, + "actual_output": golden.actual_output, + "expected_output": golden.expected_output, + "context": golden.context, + } + for golden in self.goldens + ] + json.dump(json_data, file, indent=4) + + elif file_type == "csv": + with open(full_file_path, "w", newline="") as file: + writer = csv.writer(file) + writer.writerow(["input", "actual_output", "expected_output", "context"]) + for golden in self.goldens: + context_str = "|".join(golden.context) + writer.writerow( + [golden.input, golden.actual_output, golden.expected_output, context_str] + ) + + print(f"Evaluation dataset saved at {full_file_path}!") \ No newline at end of file diff --git a/deepeval/synthesizer/synthesizer.py b/deepeval/synthesizer/synthesizer.py index 9e6b7a1a2..76a6e78f8 100644 --- a/deepeval/synthesizer/synthesizer.py +++ b/deepeval/synthesizer/synthesizer.py @@ -17,7 +17,6 @@ from deepeval.utils import trimAndLoadJson from deepeval.dataset.golden import Golden - valid_file_types = ["csv", "json"] @@ -174,6 +173,7 @@ def save_as(self, file_type: str, directory: str): json_data = [ { "input": golden.input, + "actual_output": golden.actual_output, "expected_output": golden.expected_output, "context": golden.context, } @@ -184,11 +184,11 @@ def save_as(self, file_type: str, directory: str): elif file_type == "csv": with open(full_file_path, "w", newline="") as file: writer = csv.writer(file) - writer.writerow(["input", "expected_output", "context"]) + writer.writerow(["input", "actual_output", "expected_output", "context"]) for golden in self.synthetic_goldens: 
context_str = "|".join(golden.context) writer.writerow( - [golden.input, golden.expected_output, context_str] + [golden.input, golden.actual_output, golden.expected_output, context_str] ) print(f"Synthetic goldens saved at {full_file_path}!") diff --git a/deepeval/types.py b/deepeval/types.py index 952365c7e..c772c7899 100644 --- a/deepeval/types.py +++ b/deepeval/types.py @@ -3,4 +3,4 @@ class Languages(Enum): ENGLISH = "English" - SPANISH = "Spanish" + SPANISH = "Spanish" \ No newline at end of file From c65ddabb3852ccd857d6bbcead3be3d26013f537 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Tue, 5 Mar 2024 01:02:07 +0800 Subject: [PATCH 13/59] reformat --- deepeval/dataset/dataset.py | 14 +++++++++++--- deepeval/synthesizer/synthesizer.py | 11 +++++++++-- deepeval/types.py | 2 +- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/deepeval/dataset/dataset.py b/deepeval/dataset/dataset.py index 1a48190b3..2134fe976 100644 --- a/deepeval/dataset/dataset.py +++ b/deepeval/dataset/dataset.py @@ -25,6 +25,7 @@ valid_file_types = ["csv", "json"] + @dataclass class EvaluationDataset: test_cases: List[LLMTestCase] @@ -377,11 +378,18 @@ def save_as(self, file_type: str, directory: str): elif file_type == "csv": with open(full_file_path, "w", newline="") as file: writer = csv.writer(file) - writer.writerow(["input", "actual_output", "expected_output", "context"]) + writer.writerow( + ["input", "actual_output", "expected_output", "context"] + ) for golden in self.goldens: context_str = "|".join(golden.context) writer.writerow( - [golden.input, golden.actual_output, golden.expected_output, context_str] + [ + golden.input, + golden.actual_output, + golden.expected_output, + context_str, + ] ) - print(f"Evaluation dataset saved at {full_file_path}!") \ No newline at end of file + print(f"Evaluation dataset saved at {full_file_path}!") diff --git a/deepeval/synthesizer/synthesizer.py b/deepeval/synthesizer/synthesizer.py index 76a6e78f8..936ad9f0d 100644 --- a/deepeval/synthesizer/synthesizer.py +++ b/deepeval/synthesizer/synthesizer.py @@ -184,11 +184,18 @@ def save_as(self, file_type: str, directory: str): elif file_type == "csv": with open(full_file_path, "w", newline="") as file: writer = csv.writer(file) - writer.writerow(["input", "actual_output", "expected_output", "context"]) + writer.writerow( + ["input", "actual_output", "expected_output", "context"] + ) for golden in self.synthetic_goldens: context_str = "|".join(golden.context) writer.writerow( - [golden.input, golden.actual_output, golden.expected_output, context_str] + [ + golden.input, + golden.actual_output, + golden.expected_output, + context_str, + ] ) print(f"Synthetic goldens saved at {full_file_path}!") diff --git a/deepeval/types.py b/deepeval/types.py index c772c7899..952365c7e 100644 --- a/deepeval/types.py +++ b/deepeval/types.py @@ -3,4 +3,4 @@ class Languages(Enum): ENGLISH = "English" - SPANISH = "Spanish" \ No newline at end of file + SPANISH = "Spanish" From 475739320aa305cad27a1db3d91d85edd001986d Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Tue, 5 Mar 2024 01:05:09 +0800 Subject: [PATCH 14/59] new relesase --- deepeval/_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deepeval/_version.py b/deepeval/_version.py index a38997c63..cc86089a6 100644 --- a/deepeval/_version.py +++ b/deepeval/_version.py @@ -1 +1 @@ -__version__: str = "0.20.79" +__version__: str = "0.20.80" diff --git a/pyproject.toml b/pyproject.toml index cb56a20eb..23efd3a19 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "deepeval" -version = "0.20.79" +version = "0.20.80" description = "The Evaluation Framework for LLMs" authors = ["Jeffrey Ip "] license = "Apache-2.0" From 2c6d1c1672a79104a9329ece7121dd8617e75e4b Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Tue, 5 Mar 2024 01:34:55 +0800 Subject: [PATCH 15/59] updated docs --- docs/docs/evaluation-datasets.mdx | 45 +++++++++++ docs/docs/metrics-knowledge-retention.mdx | 91 ++++++++++++----------- 2 files changed, 91 insertions(+), 45 deletions(-) diff --git a/docs/docs/evaluation-datasets.mdx b/docs/docs/evaluation-datasets.mdx index 5bdf52fc4..c0532b1e8 100644 --- a/docs/docs/evaluation-datasets.mdx +++ b/docs/docs/evaluation-datasets.mdx @@ -72,6 +72,51 @@ dataset = EvaluationDataset( A `Golden` and `LLMTestCase` contains almost an identical class signature, so technically you can also supply other parameters such as the `actual_output` when creating a `Golden`. ::: +## Generate An Evaluation Dataset + +You can generate `EvaluationDataset`s using `deepeval`'s `Synthesizer` class. The `Synthesizer` class is a synthetic data generator that first uses an LLM to generate a series of `inputs` based on a list of provided `context`, before evolving each `input` to make them more complex and realistic. These evolved `input`s, are then used to create a list of goldens, which will form your `EvaluationDataset`. + +```python +from deepeval.synthesizer import Synthesizer +from deepeval.dataset import EvaluationDataset + +synthesizer = Synthesizer() +contexts = [ + ["The Earth revolves around the Sun.", "Planets are celestial bodies."], + ["Water freezes at 0 degrees Celsius.", "The chemical formula for water is H2O."], +] + +# Use synthesizer directly +synthesizer.generate_goldens(contexts=contexts) +synthesizer.save_as( + # also accepts 'csv' + file_type='json', + path="./synthetic_data" +) + + +# Use synthesizer within an EvaluationDataset +dataset = EvaluationDataset() +dataset.generate_goldens( + synthesizer=synthesizer, + contexts=contexts +) +dataset.save_as( + # also accepts 'csv' + file_type='json', + path="./synthetic_data" +) +``` + +There are two optional parameters when creating a `Synthesizer`: + +- [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4-0125-preview'. +- [Optional] `multithreading`: a boolean which when set to `True`, enables concurrent generation of goldens. Defaulted to `True`. + +:::caution +We highly recommend you to call `save_as()` to save all generated synthetic data. +::: + ## Load an Existing Dataset `deepeval` offers support for loading datasetes stored in JSON files, CSV files, and hugging face datasets into an `EvaluationDataset` as test cases. diff --git a/docs/docs/metrics-knowledge-retention.mdx b/docs/docs/metrics-knowledge-retention.mdx index 4a44bda17..7a2797009 100644 --- a/docs/docs/metrics-knowledge-retention.mdx +++ b/docs/docs/metrics-knowledge-retention.mdx @@ -16,18 +16,19 @@ This is `deepeval`'s first experimental **conversational metric**. 
A conversatio To use the `KnowledgeRetentionMetric`, you'll have to provide the following arguments when creating an `ConversationalTestCase`: -- `messages` +- `messages` (`list[LLMTestCase]`) :::info Here's an example of how to create a `ConversationalTestCase`: ```python -from deepeval.test_case import ConversationalTestCase, Message +from deepeval.test_case import ConversationalTestCase, LLMTestCase -message = Message(user_input="...", llm_response="...") -messages = [message] +message1 = LLMTestCase(input="...", actual_output="...") +message2 = LLMTestCase(input="...", actual_output="...") +message3 = LLMTestCase(input="...", actual_output="...") -conversational_test_case = ConversationalTestCase(messages=messages) +conversational_test_case = ConversationalTestCase(messages=[message1, message2, message3]) ``` ::: @@ -37,60 +38,60 @@ conversational_test_case = ConversationalTestCase(messages=messages) Let's take this conversation as an example: ```python -from deepeval.test_case import Message +from deepeval.test_case import LLMTestCase messages = [ - Message( - user_input="", - llm_response="Hello! I'm here to assist you with opening a new bank account. To start, may I have your full name, please?" + LLMTestCase( + input="", + actual_output="Hello! I'm here to assist you with opening a new bank account. To start, may I have your full name, please?" ), - Message( - user_input="Sure, it's Alex Johnson.", - llm_response="Great, Alex! Now, could you please provide your current address?" + LLMTestCase( + input="Sure, it's Alex Johnson. I also have a dog called Jacky.", + actual_output="Great, Alex! Now, could you please provide your current address?" ), - Message( - user_input="123 Maple Street, Springfield.", - llm_response="Is that all?" + LLMTestCase( + input="123 Maple Street, Springfield.", + actual_output="Is that all?" ), - Message( - user_input="I also have another address at 123 Broadway, NYC.", - llm_response="Wonderful. Next, I'll need your bank account number where we can link this new account." + LLMTestCase( + input="I also have another address at 123 Broadway, NYC.", + actual_output="Wonderful. Next, I'll need your bank account number where we can link this new account." ), - Message( - user_input="456789123", - llm_response="This account number seems invalid, can you please double-check?" + LLMTestCase( + input="456789123", + actual_output="This account number seems invalid, can you please double-check?" ), - Message( - user_input="Sorry you're right, its 456789124.", - llm_response="Thank you. And for our next step, could you tell me your date of birth?" + LLMTestCase( + input="Sorry you're right, its 456789124.", + actual_output="Thank you. And for our next step, could you tell me your date of birth?" ), - Message( - user_input="It's July 9th.", - llm_response="What about the year?" + LLMTestCase( + input="It's July 9th.", + actual_output="What about the year?" ), - Message( - user_input="1990", - llm_response="Got it. Now, for security purposes, could you share your mother's maiden name?" + LLMTestCase( + input="1990", + actual_output="Got it. Now, for security purposes, could you share your mother's maiden name?" ), - Message( - user_input="It's Smith.", - llm_response="Excellent. Just a few more details. What is your phone number?" + LLMTestCase( + input="It's Smith.", + actual_output="Excellent. Just a few more details. What is your phone number?" ), - Message( - user_input="555-0102", - llm_response="Great, we're almost done. 
Could you remind me of your full name for the account documentation?" + LLMTestCase( + input="555-0102", + actual_output="Great, we're almost done. Could you remind me of your full name for the account documentation?" ), - Message( - user_input="Didn't I tell you already? It's Alex Johnson.", - llm_response="What is your bank account number?" + LLMTestCase( + input="Didn't I tell you already? It's Alex Johnson.", + actual_output="What is your bank account number?" ), - Message( - user_input="Yes, I did... It's 456789124. Are you not keeping track of this?", - llm_response="One last question, what is the country code of your phone number?" + LLMTestCase( + input="Yes, I did... It's 456789124. Are you not keeping track of this?", + actual_output="One last question, what is the country code of your phone number?" ), - Message( - user_input="+44", - llm_response="Thank you, Alex, for bearing with me. We now have all the information we need to proceed with opening your new bank account. I appreciate your cooperation and patience throughout this process." + LLMTestCase( + input="+44", + actual_output="Thank you, Alex, for bearing with me. We now have all the information we need to proceed with opening your new bank account. I appreciate your cooperation and patience throughout this process." ) ] ``` From 709c387a6b03894fedeab0e7143e72b97dc00c47 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Tue, 5 Mar 2024 02:01:21 +0800 Subject: [PATCH 16/59] updated docs --- docs/docs/confident-ai-introduction.mdx | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/docs/confident-ai-introduction.mdx b/docs/docs/confident-ai-introduction.mdx index d807f0b4d..253b34446 100644 --- a/docs/docs/confident-ai-introduction.mdx +++ b/docs/docs/confident-ai-introduction.mdx @@ -39,3 +39,18 @@ deepeval login ``` Follow the instructions displayed on the CLI (to create an account, get your Confident API key, paste it in the CLI), and you're good to go. 
+ +:::tip +You can also login directly in Python if you already have a Confident API Key: + +```python +deepeval.login_with_confident_api_key("your-confident-api-key") +``` + +Or, via the CLI: + +```python +deepeval login --confident-api-key "your-confident-api-key" +``` + +::: From cc2a74a2a14d33235cfa7e0d8f1e863187ac7bef Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Tue, 5 Mar 2024 03:24:10 +0800 Subject: [PATCH 17/59] fix log hyperparameters --- deepeval/__init__.py | 3 +-- deepeval/decorators/__init__.py | 0 deepeval/test_run/__init__.py | 1 + .../hyperparameters.py | 25 ++++++------------- deepeval/test_run/test_run.py | 11 +++----- tests/test_hallucination.py | 24 +++++++++--------- 6 files changed, 25 insertions(+), 39 deletions(-) delete mode 100644 deepeval/decorators/__init__.py rename deepeval/{decorators => test_run}/hyperparameters.py (70%) diff --git a/deepeval/__init__.py b/deepeval/__init__.py index 30337691f..5d44fdc9c 100644 --- a/deepeval/__init__.py +++ b/deepeval/__init__.py @@ -4,10 +4,9 @@ # Optionally add telemtry from ._version import __version__ -from .decorators.hyperparameters import log_hyperparameters from deepeval.event import track from deepeval.evaluate import evaluate, run_test, assert_test -from deepeval.test_run import on_test_run_end +from deepeval.test_run import on_test_run_end, log_hyperparameters from deepeval.utils import login_with_confident_api_key from deepeval.telemetry import * diff --git a/deepeval/decorators/__init__.py b/deepeval/decorators/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/deepeval/test_run/__init__.py b/deepeval/test_run/__init__.py index 0a06a7b38..ed5646cc4 100644 --- a/deepeval/test_run/__init__.py +++ b/deepeval/test_run/__init__.py @@ -7,3 +7,4 @@ ) from .hooks import on_test_run_end, invoke_test_run_end_hook from .api import MetricsMetadata +from .hyperparameters import log_hyperparameters diff --git a/deepeval/decorators/hyperparameters.py b/deepeval/test_run/hyperparameters.py similarity index 70% rename from deepeval/decorators/hyperparameters.py rename to deepeval/test_run/hyperparameters.py index 0e53decf9..8736f462d 100644 --- a/deepeval/decorators/hyperparameters.py +++ b/deepeval/test_run/hyperparameters.py @@ -1,7 +1,4 @@ -_hyperparameters = None -_model = None -_user_prompt_template = None - +from .test_run import test_run_manager def log_hyperparameters(model: str, prompt_template: str): def decorator(func): @@ -12,6 +9,12 @@ def decorator(func): global _hyperparameters _hyperparameters = func() + test_run = test_run_manager.get_test_run() + test_run.configurations = func() + test_run.model = model + test_run.user_prompt_template = prompt_template + test_run_manager.save_test_run() + # Define the wrapper function that will be the actual decorator def wrapper(*args, **kwargs): # Optional: You can decide if you want to do something else here @@ -22,16 +25,4 @@ def wrapper(*args, **kwargs): return wrapper # Return the decorator itself - return decorator - - -def get_hyperparameters(): - return _hyperparameters - - -def get_model(): - return _model - - -def get_user_prompt_template(): - return _user_prompt_template + return decorator \ No newline at end of file diff --git a/deepeval/test_run/test_run.py b/deepeval/test_run/test_run.py index cfc43c513..e2d7e557b 100644 --- a/deepeval/test_run/test_run.py +++ b/deepeval/test_run/test_run.py @@ -12,11 +12,6 @@ from rich import print from deepeval.metrics import BaseMetric -from deepeval.decorators.hyperparameters import ( - 
get_hyperparameters, - get_model, - get_user_prompt_template, -) from deepeval.api import Api, Endpoints from deepeval.test_run.api import ( APITestCase, @@ -95,9 +90,9 @@ def cleanup(self): for metric in test_case.metrics_metadata: all_metric_dict.add_metric(metric.metric, metric.score) self.metric_scores = all_metric_dict.get_average_metric_score() - self.configurations = get_hyperparameters() - self.model = get_model() - self.user_prompt_template = get_user_prompt_template() + # self.configurations = get_hyperparameters() + # self.model = get_model() + # self.user_prompt_template = get_user_prompt_template() def save(self, f): json.dump(self.model_dump(by_alias=True, exclude_none=True), f) diff --git a/tests/test_hallucination.py b/tests/test_hallucination.py index 2986f0c62..d27dd2c0a 100644 --- a/tests/test_hallucination.py +++ b/tests/test_hallucination.py @@ -5,7 +5,7 @@ import deepeval -@pytest.mark.skip(reason="openai is expensive") +# @pytest.mark.skip(reason="openai is expensive") def test_hallucination_metric(): metric = HallucinationMetric(threshold=0.5) test_case = LLMTestCase( @@ -20,7 +20,7 @@ def test_hallucination_metric(): assert_test(test_case, [metric]) -@pytest.mark.skip(reason="openai is expensive") +# @pytest.mark.skip(reason="openai is expensive") def test_hallucination_metric_2(): metric = HallucinationMetric(threshold=0.6) test_case = LLMTestCase( @@ -33,7 +33,7 @@ def test_hallucination_metric_2(): assert_test(test_case, [metric]) -@pytest.mark.skip(reason="openai is expensive") +# @pytest.mark.skip(reason="openai is expensive") def test_hallucination_metric_3(): metric = HallucinationMetric(threshold=0.6) test_case = LLMTestCase( @@ -46,14 +46,14 @@ def test_hallucination_metric_3(): assert_test(test_case, [metric]) -# prompt_template = """You are a helpful assistant, answer the following question without using any prior knowledge. +prompt_template = """You are a helpful assistant, answer the following question without using any prior knowledge. 
-# Question: -# {question} -# """ +Question: +{question} +""" -# @deepeval.log_hyperparameters(model="gpt-4", prompt_template=prompt_template) -# def hyperparameters(): -# return { -# "model": "gpt-4", -# } +@deepeval.log_hyperparameters(model="gpt-4", prompt_template=prompt_template) +def hyperparameters(): + return { + "model": "gpt-4", + } From 54328eaac7c1a5a0015f78aae0adb1a0c6668a52 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Tue, 5 Mar 2024 03:24:21 +0800 Subject: [PATCH 18/59] reformat --- deepeval/test_run/hyperparameters.py | 3 ++- tests/test_hallucination.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/deepeval/test_run/hyperparameters.py b/deepeval/test_run/hyperparameters.py index 8736f462d..d61a4cfd4 100644 --- a/deepeval/test_run/hyperparameters.py +++ b/deepeval/test_run/hyperparameters.py @@ -1,5 +1,6 @@ from .test_run import test_run_manager + def log_hyperparameters(model: str, prompt_template: str): def decorator(func): global _model, _user_prompt_template @@ -25,4 +26,4 @@ def wrapper(*args, **kwargs): return wrapper # Return the decorator itself - return decorator \ No newline at end of file + return decorator diff --git a/tests/test_hallucination.py b/tests/test_hallucination.py index d27dd2c0a..412213147 100644 --- a/tests/test_hallucination.py +++ b/tests/test_hallucination.py @@ -52,6 +52,7 @@ def test_hallucination_metric_3(): {question} """ + @deepeval.log_hyperparameters(model="gpt-4", prompt_template=prompt_template) def hyperparameters(): return { From 583466379b04da2b27b32285b5f5f76c91912df8 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Tue, 5 Mar 2024 03:24:33 +0800 Subject: [PATCH 19/59] fix test --- tests/test_hallucination.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/test_hallucination.py b/tests/test_hallucination.py index 412213147..381d4715c 100644 --- a/tests/test_hallucination.py +++ b/tests/test_hallucination.py @@ -33,7 +33,7 @@ def test_hallucination_metric_2(): assert_test(test_case, [metric]) -# @pytest.mark.skip(reason="openai is expensive") +@pytest.mark.skip(reason="openai is expensive") def test_hallucination_metric_3(): metric = HallucinationMetric(threshold=0.6) test_case = LLMTestCase( @@ -46,15 +46,15 @@ def test_hallucination_metric_3(): assert_test(test_case, [metric]) -prompt_template = """You are a helpful assistant, answer the following question without using any prior knowledge. +# prompt_template = """You are a helpful assistant, answer the following question without using any prior knowledge. 
-Question: -{question} -""" +# Question: +# {question} +# """ -@deepeval.log_hyperparameters(model="gpt-4", prompt_template=prompt_template) -def hyperparameters(): - return { - "model": "gpt-4", - } +# @deepeval.log_hyperparameters(model="gpt-4", prompt_template=prompt_template) +# def hyperparameters(): +# return { +# "model": "gpt-4", +# } From 2ccbfb3bc6f34c70f671ec2bbf720f7562b53b85 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Tue, 5 Mar 2024 03:31:32 +0800 Subject: [PATCH 20/59] Fixed log paras --- deepeval/test_run/hyperparameters.py | 7 ------- deepeval/test_run/test_run.py | 4 ++-- tests/test_hallucination.py | 4 ++-- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/deepeval/test_run/hyperparameters.py b/deepeval/test_run/hyperparameters.py index d61a4cfd4..7034217c1 100644 --- a/deepeval/test_run/hyperparameters.py +++ b/deepeval/test_run/hyperparameters.py @@ -3,13 +3,6 @@ def log_hyperparameters(model: str, prompt_template: str): def decorator(func): - global _model, _user_prompt_template - _model = model - _user_prompt_template = prompt_template - - global _hyperparameters - _hyperparameters = func() - test_run = test_run_manager.get_test_run() test_run.configurations = func() test_run.model = model diff --git a/deepeval/test_run/test_run.py b/deepeval/test_run/test_run.py index e2d7e557b..966998b99 100644 --- a/deepeval/test_run/test_run.py +++ b/deepeval/test_run/test_run.py @@ -78,7 +78,7 @@ class TestRun(BaseModel): metric_scores: List[MetricScoreType] = Field( default_factory=lambda: [], alias="metricScores" ) - configurations: Optional[dict[Any, Any]] = Field(default_factory=dict) + configurations: Optional[dict[Any, Any]] = Field(None) model: Optional[str] = Field(None) user_prompt_template: Optional[str] = Field( None, alias="userPromptTemplate" @@ -131,7 +131,7 @@ def create_test_run( testFile=file_name, testCases=[], metricScores=[], - configurations={}, + configurations=None, deployment=deployment, deploymentConfigs=deployment_configs, ) diff --git a/tests/test_hallucination.py b/tests/test_hallucination.py index 381d4715c..cb51af232 100644 --- a/tests/test_hallucination.py +++ b/tests/test_hallucination.py @@ -5,7 +5,7 @@ import deepeval -# @pytest.mark.skip(reason="openai is expensive") +@pytest.mark.skip(reason="openai is expensive") def test_hallucination_metric(): metric = HallucinationMetric(threshold=0.5) test_case = LLMTestCase( @@ -20,7 +20,7 @@ def test_hallucination_metric(): assert_test(test_case, [metric]) -# @pytest.mark.skip(reason="openai is expensive") +@pytest.mark.skip(reason="openai is expensive") def test_hallucination_metric_2(): metric = HallucinationMetric(threshold=0.6) test_case = LLMTestCase( From 979e457255ad0e9c0fc9091cf65e4c198de8f6b2 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Tue, 5 Mar 2024 03:34:31 +0800 Subject: [PATCH 21/59] new release --- deepeval/_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deepeval/_version.py b/deepeval/_version.py index cc86089a6..172fe38af 100644 --- a/deepeval/_version.py +++ b/deepeval/_version.py @@ -1 +1 @@ -__version__: str = "0.20.80" +__version__: str = "0.20.81" diff --git a/pyproject.toml b/pyproject.toml index 23efd3a19..28f75735d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "deepeval" -version = "0.20.80" +version = "0.20.81" description = "The Evaluation Framework for LLMs" authors = ["Jeffrey Ip "] license = "Apache-2.0" From 9c2ef0d8a3290613af885bb99206426a30b19249 Mon Sep 17 
00:00:00 2001 From: Jeffrey Ip Date: Wed, 6 Mar 2024 01:28:34 +0800 Subject: [PATCH 22/59] updated docs --- docs/docs/getting-started.mdx | 46 +++++++++++++++++--------- docs/docs/metrics-answer-relevancy.mdx | 9 ++--- 2 files changed, 33 insertions(+), 22 deletions(-) diff --git a/docs/docs/getting-started.mdx b/docs/docs/getting-started.mdx index 42142bf7f..04673f93f 100644 --- a/docs/docs/getting-started.mdx +++ b/docs/docs/getting-started.mdx @@ -6,14 +6,14 @@ sidebar_label: Quick Introduction import Envelope from "@site/src/components/envelope"; -**DeepEval** is an open-source evaluation framework for Python, that makes it easy to build -and iterate on LLM applications with the following principles in mind: +**DeepEval** is an open-source evaluation framework for LLMs. DeepEval makes it extremely easy to build +and iterate on LLM (applications) and was built with the following principles in mind: - Easily "unit test" LLM outputs in a similar way to Pytest. -- Leverage various out-of-the-box LLM-evaluated and classic evaluation metrics. +- Plug-and-use 14+ LLM-evaluated metrics, most with research backing. +- Custom metrics are simple to personalize and create. - Define evaluation datasets in Python code. -- Metrics are simple to customize. -- Real-time evaluations in production (available through Confident AI). +- Real-time evaluations in production (available on Confident AI). @@ -34,14 +34,14 @@ In your newly created virtual environement, run: pip install -U deepeval ``` -You can also keep track of all evaluation results by logging into our [in all one evaluation platform](https://confident-ai.com), and use Confident AI's [proprietary LLM evaluation agent](metrics-judgemental) for evaluation: +You can also keep track of all evaluation results by logging onto [Confident AI, an all in one evaluation platform](https://app.confident-ai.com): ```console deepeval login ``` :::note -**[Contact us](https://calendly.com/jeffreyip-cno/sales-call)** if you're dealing with sensitive data that has to reside in your private VPCs. +**[Contact us](https://calendly.com/jeffreyip-myw/confident-ai-intro-call)** if you're dealing with sensitive data that has to reside in your private VPCs. ::: ## Create Your First Test Case @@ -81,7 +81,7 @@ deepeval test run test_example.py You'll need to set your `OPENAI_API_KEY` as an enviornment variable before running the `AnswerRelevancyMetric`, since the `AnswerRelevancyMetric` is an LLM-evaluated metric. To use **ANY** custom LLM of your choice, [check out this part of the docs](evaluation-introduction#using-a-custom-llm). ::: -To save the test results locally for each test run, set the `DEEPEVAL_RESULTS_FOLDER` environement variable to your relative path of choice: +You can also save test results locally for each test run. Simply set the `DEEPEVAL_RESULTS_FOLDER` environement variable to your relative path of choice: ```console export DEEPEVAL_RESULTS_FOLDER="./data" @@ -89,11 +89,11 @@ export DEEPEVAL_RESULTS_FOLDER="./data" ## Create Your First Custom Metric -`deepeval` provides two types of custom metric to evaluate LLM outputs: metrics evaluated with LLMs and metrics evaluated without LLMs. Here is a brief overview of each custom metric. +`deepeval` provides two types of custom metric to evaluate LLM outputs: metrics evaluated with LLMs and metrics evaluated without LLMs. Here is a brief overview of each: ### LLM Evaluated Metrics -An LLM evaluated metric (aka. LLM-Evals), is one where evaluation is carried out by an LLM. 
`deepeval` offers G-Eval, a state-of-the-art framework to create LLM-Evals. +An LLM evaluated metric (aka. LLM-Evals), is one where evaluation is carried out by an LLM. `deepeval` offers more than a dozen LLM-Evals, one of which is G-Eval, a state-of-the-art framework to evaluate LLM outputs. ```python title="test_example.py" from deepeval import assert_test @@ -116,9 +116,13 @@ def test_coherence(): assert_test(test_case, [coherence_metric]) ``` -### Classic Metrics +:::info +All of `deepeval`'s default metrics are LLM-Evals, most of which are backed by associated research papers. To learn more, here is an article on [everything you need to know about LLM evaluation metrics](https://www.confident-ai.com/blog/llm-evaluation-metrics-everything-you-need-for-llm-evaluation). +::: + +### Non-LLM Evaluated Metrics -A classic metric is a metric where evluation is not carried out by another LLM. You can define a custom classic metric by defining the `measure` and `is_successful` methods upon inheriting the base `Metric` class. Paste in the following: +A non-LLM evaluated metric is a metric where evluation is not carried out by another LLM. Since all of `deepeval`'s metrics are evaluated using LLMs, you'll have to created a custom metric instead by defining the `measure` and `is_successful` methods upon inheriting the base `BaseMetric` class. ```python title="test_example.py" from deepeval.metrics import BaseMetric @@ -168,9 +172,15 @@ You should see both `test_answer_relevancy` and `test_length` passing. - Custom metrics requires a `threshsold` as a passing criteria. In the case of our `LengthMetric`, the passing criteria was whether the `max_length` of `actual_output` is greater than 10. - We removed `retrieval_context` in `test_length` since it was irrelevant to evaluating output length. However `input` and `actual_output` is always mandatory when creating an `LLMTestCase`. +:::tip +You can also create a custom metric to combine several different metrics into one. For example. combining the `AnswerRelevancyMetric` and `FaithfulnessMetric` to test whether an LLM output is both relevant and faithful (ie. not hallucinating). + +[Click here to learn more on how to create a custom metric](metrics-custom) +::: + ## Combine Your Metrics -You might've noticed we have duplicated test cases for both `test_answer_relevancy` and `test_length` (ie. they have the same input and expected output). To avoid this redundancy, `deepeval` offers an easy way to apply as many metrics as you wish on a single test case. +You might've noticed we have duplicated test cases for both `test_answer_relevancy` and `test_length` (ie. they have the same input and expected output). To avoid this redundancy, `deepeval` offers an easy way to apply as many metrics as you wish for a single test case. ```python title="test_example.py" ... @@ -200,9 +210,15 @@ In this scenario, `test_everything` only passes if all metrics are passing. Run deepeval test run test_example.py ``` -## Evaluate Your Dataset in Bulk +## Evaluate Your Evaluation Dataset + +An evaluation dataset in `deepeval` is simply a collection of `LLMTestCases` and/or `Goldens`. + +:::tip +We're not going to dive into what a `Golden` is here, but it is an important concept if you're looking to generate LLM outputs at evlauation time. To learn more about `Golden`s, [click here.](evaluation-dataset#with-goldens) +::: -An evaluation dataset in `deepeval` is a collection of test cases. 
Using `deepeval`'s Pytest integration, you can utilize the `@pytest.mark.parametrize` decorator to loop through and evaluate your evaluation dataset. +Using `deepeval`'s Pytest integration, you can utilize the `@pytest.mark.parametrize` decorator to loop through and evaluate your evaluation dataset. ```python title="test_dataset.py" import pytest diff --git a/docs/docs/metrics-answer-relevancy.mdx b/docs/docs/metrics-answer-relevancy.mdx index 7d9aa5b76..1ffbd78c7 100644 --- a/docs/docs/metrics-answer-relevancy.mdx +++ b/docs/docs/metrics-answer-relevancy.mdx @@ -18,7 +18,6 @@ To use the `AnswerRelevancyMetric`, you'll have to provide the following argumen - `input` - `actual_output` -- `retrieval_context` ## Example @@ -30,9 +29,6 @@ from deepeval.test_case import LLMTestCase # Replace this with the actual output from your LLM application actual_output = "We offer a 30-day full refund at no extra cost." -# Replace this with the actual retrieved context from your RAG pipeline -retrieval_context = ["All customers are eligible for a 30 day full refund at no extra cost."] - metric = AnswerRelevancyMetric( threshold=0.7, model="gpt-4", @@ -40,8 +36,7 @@ metric = AnswerRelevancyMetric( ) test_case = LLMTestCase( input="What if these shoes don't fit?", - actual_output=actual_output, - retrieval_context=retrieval_context + actual_output=actual_output ) metric.measure(test_case) @@ -64,4 +59,4 @@ The `AnswerRelevancyMetric` score is calculated according to the following equat -The `AnswerRelevancyMetric` first uses an LLM to extract all statements made in the `actual_output`, before using the same LLM to classify whether each statement is relevant to the `input` using additional context from the `retrieval_context`. +The `AnswerRelevancyMetric` first uses an LLM to extract all statements made in the `actual_output`, before using the same LLM to classify whether each statement is relevant to the `input`. 
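[Editor's note] The scoring step described in the revised answer-relevancy docs above reduces to a simple ratio over per-statement verdicts, which the strict-mode patch that follows implements in `AnswerRelevancyMetric._generate_score`. The sketch below only illustrates that calculation; the `Verdict` dataclass and `answer_relevancy_score` helper are hypothetical names used for illustration and are not part of the deepeval API.

```python
# Illustrative sketch: answer relevancy = relevant statements / total statements.
# Mirrors the _generate_score logic in the patch below; names here are hypothetical.
from dataclasses import dataclass
from typing import List


@dataclass
class Verdict:
    verdict: str  # "yes" if the statement is relevant to the input, "no" otherwise


def answer_relevancy_score(verdicts: List[Verdict]) -> float:
    if len(verdicts) == 0:
        return 0.0
    relevant = sum(1 for v in verdicts if v.verdict.strip().lower() != "no")
    return relevant / len(verdicts)


# Three extracted statements, two judged relevant -> score of 2/3 (~0.67).
print(answer_relevancy_score([Verdict("yes"), Verdict("yes"), Verdict("no")]))
```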
From 6251455858bca0f447e60f462c1ec520ed22b5f2 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Wed, 6 Mar 2024 08:38:13 +0800 Subject: [PATCH 23/59] added strict mode --- .../answer_relevancy/answer_relevancy.py | 16 ++- deepeval/metrics/bias/bias.py | 15 ++- deepeval/metrics/bias/template.py | 5 +- .../contextual_precision.py | 19 +-- .../contextual_recall/contextual_recall.py | 16 ++- .../contextual_relevancy.py | 12 +- deepeval/metrics/faithfulness/faithfulness.py | 17 ++- deepeval/metrics/g_eval/g_eval.py | 9 +- .../metrics/hallucination/hallucination.py | 19 ++- .../knowledge_retention.py | 16 ++- .../metrics/summarization/summarization.py | 14 ++- deepeval/metrics/toxicity/toxicity.py | 12 +- deepeval/progress_context.py | 3 +- temp_test_run_data.json | 1 + tests/test_answer_relevancy.py | 8 +- tests/test_bias.py | 6 +- tests/test_everything.py | 116 ++++++++++++++++++ 17 files changed, 244 insertions(+), 60 deletions(-) create mode 100644 temp_test_run_data.json create mode 100644 tests/test_everything.py diff --git a/deepeval/metrics/answer_relevancy/answer_relevancy.py b/deepeval/metrics/answer_relevancy/answer_relevancy.py index 48a8aaa72..fde7fe151 100644 --- a/deepeval/metrics/answer_relevancy/answer_relevancy.py +++ b/deepeval/metrics/answer_relevancy/answer_relevancy.py @@ -21,22 +21,25 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, + strict_mode: bool = False, ): - self.threshold = threshold + self.threshold = 1 if strict_mode else threshold if isinstance(model, DeepEvalBaseLLM): self.model = model else: self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.n = 5 + self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: if test_case.input is None or test_case.actual_output is None: raise ValueError( "Input, actual output, or retrieval context cannot be None" ) - with metrics_progress_context(self.__name__, self.evaluation_model): + with metrics_progress_context( + self.__name__, self.evaluation_model, self.strict_mode + ): # generate statements self.statements: List[str] = self._generate_statements( test_case.actual_output @@ -58,7 +61,8 @@ def measure(self, test_case: LLMTestCase) -> float: return self.score def _generate_score(self): - if len(self.verdicts) == 0: + number_of_verdicts = len(self.verdicts) + if number_of_verdicts == 0: return 0 relevant_count = 0 @@ -66,7 +70,9 @@ def _generate_score(self): if verdict.verdict.strip().lower() != "no": relevant_count += 1 - return relevant_count / len(self.verdicts) + score = relevant_count / number_of_verdicts + + return 0 if self.strict_mode and score < self.threshold else score def _generate_reason(self, input: str, score: float) -> str: if self.include_reason is False: diff --git a/deepeval/metrics/bias/bias.py b/deepeval/metrics/bias/bias.py index 63d07c2a5..3d35cc3c1 100644 --- a/deepeval/metrics/bias/bias.py +++ b/deepeval/metrics/bias/bias.py @@ -22,20 +22,24 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, + strict_mode: bool = False, ): - self.threshold = threshold + self.threshold = 0 if strict_mode else threshold if isinstance(model, DeepEvalBaseLLM): self.model = model else: self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason + self.strict_mode = strict_mode def measure(self, test_case: 
LLMTestCase): if test_case.input is None or test_case.actual_output is None: raise ValueError("Input or actual output cannot be None") - with metrics_progress_context(self.__name__, self.evaluation_model): + with metrics_progress_context( + self.__name__, self.evaluation_model, self.strict_mode + ): self.opinions: List[str] = self._generate_opinions( test_case.actual_output ) @@ -67,8 +71,8 @@ def _generate_reason(self, score) -> str: return res def _generate_score(self) -> float: - total = len(self.verdicts) - if total == 0: + number_of_verdicts = len(self.verdicts) + if number_of_verdicts == 0: return 0 bias_count = 0 @@ -76,7 +80,8 @@ def _generate_score(self) -> float: if verdict.verdict.strip().lower() == "yes": bias_count += 1 - return bias_count / total + score = bias_count / number_of_verdicts + return 1 if self.strict_mode and score > self.threshold else score def _generate_verdicts(self) -> List[BiasVerdict]: verdicts: List[BiasVerdict] = [] diff --git a/deepeval/metrics/bias/template.py b/deepeval/metrics/bias/template.py index e1704b346..74ed99879 100644 --- a/deepeval/metrics/bias/template.py +++ b/deepeval/metrics/bias/template.py @@ -52,7 +52,7 @@ def generate_verdicts(opinions): ** IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key as a list of JSON objects. -Example Opinions: ["Government meddling in healthcare bloats costs and quashes innovation—free markets are the answer.", "Different models of government involvement in healthcare aim to balance accessibility, cost, and quality, each with its own merits and challenges."] +Example Opinions: ["Government meddling in healthcare bloats costs and quashes innovation—free markets are the answer.", "Different models of government involvement in healthcare aim to balance accessibility, cost, and quality, each with its own merits and challenges.", "I cannot answer this question."] Example JSON: {{ @@ -64,6 +64,9 @@ def generate_verdicts(opinions): {{ "verdict": "no" }}, + {{ + "verdict": "no" + }}, ] }} diff --git a/deepeval/metrics/contextual_precision/contextual_precision.py b/deepeval/metrics/contextual_precision/contextual_precision.py index 8e975952a..6fe29bcc6 100644 --- a/deepeval/metrics/contextual_precision/contextual_precision.py +++ b/deepeval/metrics/contextual_precision/contextual_precision.py @@ -24,14 +24,16 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, + strict_mode: bool = False, ): - self.threshold = threshold + self.threshold = 1 if strict_mode else threshold self.include_reason = include_reason if isinstance(model, DeepEvalBaseLLM): self.model = model else: self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() + self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: if ( @@ -44,7 +46,9 @@ def measure(self, test_case: LLMTestCase) -> float: "Input, actual output, expected output, or retrieval context cannot be None" ) - with metrics_progress_context(self.__name__, self.evaluation_model): + with metrics_progress_context( + self.__name__, self.evaluation_model, self.strict_mode + ): self.verdicts: List[ContextualPrecisionVerdict] = ( self._generate_verdicts( test_case.input, @@ -86,7 +90,8 @@ def _generate_reason(self, input: str, score: float): return res def _generate_score(self): - if len(self.verdicts) == 0: + number_of_verdicts = len(self.verdicts) + if number_of_verdicts == 0: return 0 # Convert verdicts to a binary list where 'yes' is 1 
and others are 0 @@ -106,15 +111,13 @@ def _generate_score(self): precision_at_k = relevant_nodes_count / k sum_weighted_precision_at_k += precision_at_k * is_relevant - # Calculate weighted cumulative precision if relevant_nodes_count == 0: return 0 - weighted_cumulative_precision = ( - sum_weighted_precision_at_k / relevant_nodes_count - ) + # Calculate weighted cumulative precision + score = sum_weighted_precision_at_k / relevant_nodes_count - return weighted_cumulative_precision + return 0 if self.strict_mode and score < self.threshold else score def _generate_verdicts( self, input: str, expected_output: str, retrieval_context: List[str] diff --git a/deepeval/metrics/contextual_recall/contextual_recall.py b/deepeval/metrics/contextual_recall/contextual_recall.py index 3f2d0424f..6c190773f 100644 --- a/deepeval/metrics/contextual_recall/contextual_recall.py +++ b/deepeval/metrics/contextual_recall/contextual_recall.py @@ -22,15 +22,16 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, + strict_mode: bool = False, ): - self.threshold = threshold + self.threshold = 1 if strict_mode else threshold if isinstance(model, DeepEvalBaseLLM): self.model = model else: self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.n = 5 + self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: if ( @@ -42,7 +43,9 @@ def measure(self, test_case: LLMTestCase) -> float: raise ValueError( "Input, actual output, expected output, or retrieval context cannot be None" ) - with metrics_progress_context(self.__name__, self.evaluation_model): + with metrics_progress_context( + self.__name__, self.evaluation_model, self.strict_mode + ): self.verdicts: List[ContextualRecallVerdict] = ( self._generate_verdicts( test_case.expected_output, test_case.retrieval_context @@ -83,7 +86,8 @@ def _generate_reason(self, expected_output: str, score: float): return res def _generate_score(self): - if len(self.verdicts) == 0: + number_of_verdicts = len(self.verdicts) + if number_of_verdicts == 0: return 0 justified_sentences = 0 @@ -91,7 +95,9 @@ def _generate_score(self): if verdict.verdict.lower() == "yes": justified_sentences += 1 - return justified_sentences / len(self.verdicts) + score = justified_sentences / number_of_verdicts + + return 0 if self.strict_mode and score < self.threshold else score def _generate_verdicts( self, expected_output: str, retrieval_context: List[str] diff --git a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py index 67660b448..58da7aede 100644 --- a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +++ b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py @@ -26,8 +26,9 @@ def __init__( model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, multithreading: bool = True, + strict_mode: bool = False, ): - self.threshold = threshold + self.threshold = 1 if strict_mode else threshold if isinstance(model, DeepEvalBaseLLM): self.model = model else: @@ -35,6 +36,7 @@ def __init__( self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason self.multithreading = multithreading + self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: if ( @@ -45,7 +47,9 @@ def measure(self, test_case: LLMTestCase) -> float: raise ValueError( "Input, actual output, or retrieval 
context cannot be None" ) - with metrics_progress_context(self.__name__, self.evaluation_model): + with metrics_progress_context( + self.__name__, self.evaluation_model, self.strict_mode + ): self.verdicts_list: List[List[ContextualRelevancyVerdict]] = ( self._generate_verdicts_list( test_case.input, test_case.retrieval_context @@ -94,10 +98,12 @@ def _generate_score(self): if total_sentence_count == 0: return 0 - return ( + score = ( total_sentence_count - irrelevant_sentences ) / total_sentence_count + return 0 if self.strict_mode and score < self.threshold else score + def _generate_verdicts( self, text: str, diff --git a/deepeval/metrics/faithfulness/faithfulness.py b/deepeval/metrics/faithfulness/faithfulness.py index ba4ff6cff..bb15c5086 100644 --- a/deepeval/metrics/faithfulness/faithfulness.py +++ b/deepeval/metrics/faithfulness/faithfulness.py @@ -24,8 +24,9 @@ def __init__( model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, multithreading: bool = True, + strict_mode: bool = False, ): - self.threshold = threshold + self.threshold = 1 if strict_mode else threshold if isinstance(model, DeepEvalBaseLLM): self.model = model else: @@ -33,6 +34,7 @@ def __init__( self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason self.multithreading = multithreading + self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase): if ( @@ -43,7 +45,9 @@ def measure(self, test_case: LLMTestCase): raise ValueError( "Input, actual output, and retrieval context cannot be None" ) - with metrics_progress_context(self.__name__, self.evaluation_model): + with metrics_progress_context( + self.__name__, self.evaluation_model, self.strict_mode + ): if self.multithreading: # Use multithreading to generate truths and claims in parallel with ThreadPoolExecutor() as executor: @@ -90,15 +94,18 @@ def _generate_reason(self, score) -> str: return res def _generate_score(self) -> float: - total = len(self.verdicts) - if total == 0: + number_of_verdicts = len(self.verdicts) + if number_of_verdicts == 0: return 0 + faithfulness_count = 0 for verdict in self.verdicts: if verdict.verdict.strip().lower() != "no": faithfulness_count += 1 - return faithfulness_count / total + score = faithfulness_count / number_of_verdicts + + return 0 if self.strict_mode and score < self.threshold else score def _generate_verdicts(self) -> List[FaithfulnessVerdict]: verdicts: List[FaithfulnessVerdict] = [] diff --git a/deepeval/metrics/g_eval/g_eval.py b/deepeval/metrics/g_eval/g_eval.py index 21f99d923..fc89981c9 100644 --- a/deepeval/metrics/g_eval/g_eval.py +++ b/deepeval/metrics/g_eval/g_eval.py @@ -27,6 +27,7 @@ def __init__( evaluation_steps: Optional[List[str]] = None, model: Optional[Union[str, DeepEvalBaseLLM]] = None, threshold: float = 0.5, + strict_mode: bool = False, ): self.name = name self.evaluation_params = evaluation_params @@ -54,7 +55,8 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.evaluation_steps = evaluation_steps - self.threshold = threshold + self.threshold = 1 if strict_mode else threshold + self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase): """LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf""" @@ -75,7 +77,10 @@ def measure(self, test_case: LLMTestCase): score, reason = self.evaluate(test_case) self.reason = reason - self.score = float(score) / 10 + + score = float(score) / 10 + + self.score = 0 if self.strict_mode 
and score < self.threshold else score self.success = score >= self.threshold capture_metric_type(self.__name__) return self.score diff --git a/deepeval/metrics/hallucination/hallucination.py b/deepeval/metrics/hallucination/hallucination.py index 87acbf7f8..63574b2cb 100644 --- a/deepeval/metrics/hallucination/hallucination.py +++ b/deepeval/metrics/hallucination/hallucination.py @@ -24,8 +24,9 @@ def __init__( model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, multithreading: bool = True, + strict_mode: bool = False, ): - self.threshold = threshold + self.threshold = 1 if strict_mode else threshold if isinstance(model, DeepEvalBaseLLM): self.model = model else: @@ -33,6 +34,7 @@ def __init__( self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason self.multithreading = multithreading + self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase): if ( @@ -41,7 +43,9 @@ def measure(self, test_case: LLMTestCase): or test_case.context is None ): raise ValueError("Input, actual output, or context cannot be None") - with metrics_progress_context(self.__name__, self.evaluation_model): + with metrics_progress_context( + self.__name__, self.evaluation_model, self.strict_mode + ): self.verdicts: List[HallucinationVerdict] = self._generate_verdicts( test_case.actual_output, test_case.context ) @@ -74,16 +78,19 @@ def _generate_reason(self, score): return res def _generate_score(self) -> float: - total = len(self.verdicts) - hallucination_count = 0 - if total == 0: + number_of_verdicts = len(self.verdicts) + if number_of_verdicts == 0: return 0 + hallucination_count = 0 + for verdict in self.verdicts: if verdict.verdict.strip().lower() == "no": hallucination_count += 1 - return hallucination_count / total + score = hallucination_count / number_of_verdicts + + return 1 if self.strict_mode and score > self.threshold else score def _generate_verdicts( self, actual_output: str, contexts: List[str] diff --git a/deepeval/metrics/knowledge_retention/knowledge_retention.py b/deepeval/metrics/knowledge_retention/knowledge_retention.py index 30e47e085..5de0aa2f4 100644 --- a/deepeval/metrics/knowledge_retention/knowledge_retention.py +++ b/deepeval/metrics/knowledge_retention/knowledge_retention.py @@ -28,20 +28,24 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, + strict_mode: bool = False, ): - self.threshold = threshold + self.threshold = 1 if strict_mode else threshold if isinstance(model, DeepEvalBaseLLM): self.model = model else: self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason + self.strict_mode = strict_mode def measure(self, test_case: ConversationalTestCase): if len(test_case.messages) == 0: raise ValueError("Messages cannot be empty") - with metrics_progress_context(self.__name__, self.evaluation_model): + with metrics_progress_context( + self.__name__, self.evaluation_model, self.strict_mode + ): self.knowledges: List[Knowledge] = self._generate_knowledges( test_case ) @@ -75,8 +79,8 @@ def _generate_reason(self, score: float) -> str: return res def _generate_score(self) -> float: - total = len(self.verdicts) - if total == 0: + number_of_verdicts = len(self.verdicts) + if number_of_verdicts == 0: return 0 retention_count = 0 @@ -84,7 +88,9 @@ def _generate_score(self) -> float: if verdict.verdict.strip().lower() == "no": retention_count += 1 - return retention_count / total + 
score = retention_count / number_of_verdicts + + return 0 if self.strict_mode and score < self.threshold else score def _generate_verdicts( self, test_case: ConversationalTestCase diff --git a/deepeval/metrics/summarization/summarization.py b/deepeval/metrics/summarization/summarization.py index 1f8803192..0d5302afe 100644 --- a/deepeval/metrics/summarization/summarization.py +++ b/deepeval/metrics/summarization/summarization.py @@ -39,8 +39,9 @@ def __init__( assessment_questions: Optional[List[str]] = None, include_reason: bool = True, multithreading=True, + strict_mode: bool = False, ): - self.threshold = threshold + self.threshold = 1 if strict_mode else threshold if isinstance(model, DeepEvalBaseLLM): self.model = model else: @@ -55,12 +56,15 @@ def __init__( self.multithreading = multithreading self.include_reason = include_reason self.n = n + self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase): if test_case.input is None or test_case.actual_output is None: raise ValueError("Input or actual output cannot be None") - with metrics_progress_context(self.__name__, self.evaluation_model): + with metrics_progress_context( + self.__name__, self.evaluation_model, self.strict_mode + ): if test_case.input is None or test_case.actual_output is None: raise ValueError("Input and actual output cannot be None") @@ -161,7 +165,7 @@ def _generate_score(self, score_type: ScoreType) -> float: if verdict.verdict.strip().lower() == "yes": faithfulness_count += 1 - return faithfulness_count / total + score = faithfulness_count / total else: if self.assessment_questions is None: @@ -177,7 +181,9 @@ def _generate_score(self, score_type: ScoreType) -> float: if total == 0: return 0 - return coverage_count / total + score = coverage_count / total + + return 0 if self.strict_mode and score < self.threshold else score def _generate_answers(self, text: str) -> List[str]: prompt = SummarizationTemplate.generate_answers( diff --git a/deepeval/metrics/toxicity/toxicity.py b/deepeval/metrics/toxicity/toxicity.py index 94d7bfecc..80ec950d6 100644 --- a/deepeval/metrics/toxicity/toxicity.py +++ b/deepeval/metrics/toxicity/toxicity.py @@ -23,20 +23,24 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, + strict_mode: bool = False, ): - self.threshold = threshold + self.threshold = 0 if strict_mode else threshold if isinstance(model, DeepEvalBaseLLM): self.model = model else: self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason + self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase): if test_case.input is None or test_case.actual_output is None: raise ValueError("Input or actual output cannot be None") - with metrics_progress_context(self.__name__, self.evaluation_model): + with metrics_progress_context( + self.__name__, self.evaluation_model, self.strict_mode + ): self.opinions: List[str] = self._generate_opinions( test_case.actual_output ) @@ -77,7 +81,9 @@ def _generate_score(self) -> float: if verdict.verdict.strip().lower() == "yes": toxic_count += 1 - return toxic_count / total + score = toxic_count / total + + return 1 if self.strict_mode and score > self.threshold else score def _generate_verdicts(self) -> List[ToxicityVerdict]: verdicts: List[ToxicityVerdict] = [] diff --git a/deepeval/progress_context.py b/deepeval/progress_context.py index a29032e2f..d9dccecbf 100644 --- a/deepeval/progress_context.py +++ 
b/deepeval/progress_context.py @@ -23,10 +23,11 @@ def progress_context( def metrics_progress_context( metric_name: str, evaluation_model: str, + strict_mode: bool, total: int = 9999, transient: bool = True, ): - description = f"✨ 🍰 ✨ You're using DeepEval's latest {metric_name} Metric (using {evaluation_model})! This may take a minute..." + description = f"✨ 🍰 ✨ You're using DeepEval's latest {metric_name} Metric (using {evaluation_model}, strict_mode={strict_mode})! This may take a minute..." console = Console(file=sys.stderr) # Direct output to standard error with Progress( SpinnerColumn(), diff --git a/temp_test_run_data.json b/temp_test_run_data.json new file mode 100644 index 000000000..d6c80e615 --- /dev/null +++ b/temp_test_run_data.json @@ -0,0 +1 @@ +{"testFile": "tests/test_everything.py", "deployment": false, "testCases": [], "metricScores": []} \ No newline at end of file diff --git a/tests/test_answer_relevancy.py b/tests/test_answer_relevancy.py index ad2e9b2f1..4b607e064 100644 --- a/tests/test_answer_relevancy.py +++ b/tests/test_answer_relevancy.py @@ -46,12 +46,12 @@ """ -@pytest.mark.skip(reason="openai is very expensive") +# @pytest.mark.skip(reason="openai is very expensive") def test_answer_relevancy(): - metric = AnswerRelevancyMetric(threshold=0.5) + metric = AnswerRelevancyMetric(threshold=0.5, strict_mode=True) test_case = LLMTestCase( - input="How many ducks are there in the pond?", - actual_output="Five", + input=question, + actual_output=answer, retrieval_context=[one, two, three], ) assert_test(test_case, [metric]) diff --git a/tests/test_bias.py b/tests/test_bias.py index 65bb873d8..ff6165498 100644 --- a/tests/test_bias.py +++ b/tests/test_bias.py @@ -26,15 +26,15 @@ """ output = """ -Sorry, I cannot provide views for people living with autism. +Sorry, I cannot provide views on people living with autism. """ -@pytest.mark.skip(reason="openai is expensive") +# @pytest.mark.skip(reason="openai is expensive") def test_bias(): test_case = LLMTestCase( input="What is the primary difference between a comet and an asteroid?", actual_output=output, ) - metric = BiasMetric() + metric = BiasMetric(strict_mode=True) assert_test(test_case, [metric]) diff --git a/tests/test_everything.py b/tests/test_everything.py new file mode 100644 index 000000000..1353dcfd7 --- /dev/null +++ b/tests/test_everything.py @@ -0,0 +1,116 @@ +import pytest +from deepeval.test_case import LLMTestCase, LLMTestCaseParams +from deepeval.metrics import ( + AnswerRelevancyMetric, + FaithfulnessMetric, + ContextualRecallMetric, + ContextualRelevancyMetric, + ContextualPrecisionMetric, + HallucinationMetric, + BiasMetric, + ToxicityMetric, + GEval, + SummarizationMetric, +) +from deepeval import assert_test + +question = "What are the primary benefits of meditation?" +answer = """ +Meditation offers a rich tapestry of benefits that touch upon various aspects of well-being. On a mental level, +it greatly reduces stress and anxiety, fostering enhanced emotional health. This translates to better emotional +regulation and a heightened sense of overall well-being. Interestingly, the practice of meditation has been around +for centuries, evolving through various cultures and traditions, which underscores its timeless relevance. + +Physically, it contributes to lowering blood pressure and alleviating chronic pain, which is pivotal for long-term health. +Improved sleep quality is another significant benefit, aiding in overall physical restoration. 
Cognitively, meditation is a +boon for enhancing attention span, improving memory, and slowing down age-related cognitive decline. Amidst these benefits, +meditation's role in cultural and historical contexts is a fascinating side note, though not directly related to its health benefits. + +Such a comprehensive set of advantages makes meditation a valuable practice for individuals seeking holistic improvement i +n both mental and physical health, transcending its historical and cultural origins. +""" + +one = """ +Meditation is an ancient practice, rooted in various cultural traditions, where individuals +engage in mental exercises like mindfulness or concentration to promote mental clarity, emotional +calmness, and physical relaxation. This practice can range from techniques focusing on breath, visual +imagery, to movement-based forms like yoga. The goal is to bring about a sense of peace and self-awareness, +enabling individuals to deal with everyday stress more effectively. +""" + +two = """ +One of the key benefits of meditation is its impact on mental health. It's widely used as a tool to +reduce stress and anxiety. Meditation helps in managing emotions, leading to enhanced emotional health. +It can improve symptoms of anxiety and depression, fostering a general sense of well-being. Regular practice +is known to increase self-awareness, helping individuals understand their thoughts and emotions more clearly +and reduce negative reactions to challenging situations. +""" + +three = """ +Meditation has shown positive effects on various aspects of physical health. It can lower blood pressure, +reduce chronic pain, and improve sleep. From a cognitive perspective, meditation can sharpen the mind, increase +attention span, and improve memory. It's particularly beneficial in slowing down age-related cognitive decline and +enhancing brain functions related to concentration and attention. +""" + +four = """ +Understanding comets and asteroids is crucial in studying the solar system's formation +and evolution. Comets, which are remnants from the outer solar system, can provide +insights into its icy and volatile components. Asteroids, primarily remnants of the +early solar system's formation, offer clues about the materials that didn't form into +planets, mostly located in the asteroid belt. +""" + +five = """ +The physical characteristics and orbital paths of comets and asteroids vary significantly. +Comets often have highly elliptical orbits, taking them close to the Sun and then far into +the outer solar system. Their icy composition leads to distinctive features like tails and +comas. Asteroids, conversely, have more circular orbits and lack these visible features, +being composed mostly of rock and metal. 
+""" + +strict_mode = True + + +@pytest.mark.skip(reason="openai is expensive") +def test_everything(): + metric1 = AnswerRelevancyMetric(threshold=0.5, strict_mode=strict_mode) + metric2 = FaithfulnessMetric(threshold=0.5, strict_mode=strict_mode) + metric3 = ContextualPrecisionMetric(threshold=0.5, strict_mode=strict_mode) + metric4 = ContextualRecallMetric(threshold=0.5, strict_mode=strict_mode) + metric5 = ContextualRelevancyMetric(threshold=0.5, strict_mode=strict_mode) + metric6 = BiasMetric(threshold=0.5, strict_mode=strict_mode) + metric7 = ToxicityMetric(threshold=0.5, strict_mode=strict_mode) + metric8 = HallucinationMetric(threshold=0.5, strict_mode=strict_mode) + metric9 = SummarizationMetric(threshold=0.5, strict_mode=strict_mode) + metric10 = GEval( + name="Coherence", + criteria="Coherence - determine if the actual output is coherent with the input.", + evaluation_params=[ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + ], + ) + + test_case = LLMTestCase( + input=question, + actual_output=answer, + expected_output=answer, + retrieval_context=[one, two, three], + context=[four, five], + ) + assert_test( + test_case, + [ + metric1, + metric2, + metric3, + metric4, + metric5, + metric6, + metric7, + metric8, + metric9, + metric10, + ], + ) From 60efb5e5635cf4ea4aa67a47c4000d2ff53f4c7e Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Wed, 6 Mar 2024 08:42:01 +0800 Subject: [PATCH 24/59] updated tests --- tests/test_answer_relevancy.py | 2 +- tests/test_bias.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_answer_relevancy.py b/tests/test_answer_relevancy.py index 4b607e064..42090dc54 100644 --- a/tests/test_answer_relevancy.py +++ b/tests/test_answer_relevancy.py @@ -46,7 +46,7 @@ """ -# @pytest.mark.skip(reason="openai is very expensive") +@pytest.mark.skip(reason="openai is very expensive") def test_answer_relevancy(): metric = AnswerRelevancyMetric(threshold=0.5, strict_mode=True) test_case = LLMTestCase( diff --git a/tests/test_bias.py b/tests/test_bias.py index ff6165498..71afbbada 100644 --- a/tests/test_bias.py +++ b/tests/test_bias.py @@ -30,7 +30,7 @@ """ -# @pytest.mark.skip(reason="openai is expensive") +@pytest.mark.skip(reason="openai is expensive") def test_bias(): test_case = LLMTestCase( input="What is the primary difference between a comet and an asteroid?", From 570ac193a76bcb5875fa9f226d9a398a8eec6e6d Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Wed, 6 Mar 2024 10:56:09 +0800 Subject: [PATCH 25/59] Fix strict for hallucination --- .../metrics/hallucination/hallucination.py | 2 +- temp_test_run_data.json | 1 - tests/test_everything.py | 19 ++++++++++--------- 3 files changed, 11 insertions(+), 11 deletions(-) delete mode 100644 temp_test_run_data.json diff --git a/deepeval/metrics/hallucination/hallucination.py b/deepeval/metrics/hallucination/hallucination.py index 63574b2cb..16303e22e 100644 --- a/deepeval/metrics/hallucination/hallucination.py +++ b/deepeval/metrics/hallucination/hallucination.py @@ -26,7 +26,7 @@ def __init__( multithreading: bool = True, strict_mode: bool = False, ): - self.threshold = 1 if strict_mode else threshold + self.threshold = 0 if strict_mode else threshold if isinstance(model, DeepEvalBaseLLM): self.model = model else: diff --git a/temp_test_run_data.json b/temp_test_run_data.json deleted file mode 100644 index d6c80e615..000000000 --- a/temp_test_run_data.json +++ /dev/null @@ -1 +0,0 @@ -{"testFile": "tests/test_everything.py", "deployment": false, "testCases": [], 
"metricScores": []} \ No newline at end of file diff --git a/tests/test_everything.py b/tests/test_everything.py index 1353dcfd7..d246d8c1a 100644 --- a/tests/test_everything.py +++ b/tests/test_everything.py @@ -90,6 +90,7 @@ def test_everything(): LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, ], + strict_mode=True ) test_case = LLMTestCase( @@ -102,15 +103,15 @@ def test_everything(): assert_test( test_case, [ - metric1, - metric2, - metric3, - metric4, - metric5, - metric6, - metric7, - metric8, - metric9, + # metric1, + # metric2, + # metric3, + # metric4, + # metric5, + # metric6, + # metric7, + # metric8, + # metric9, metric10, ], ) From 5c4e5249bd3ae763630d652fc1b0c3d04012ee62 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Wed, 6 Mar 2024 14:35:36 +0800 Subject: [PATCH 26/59] reformat --- tests/test_everything.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_everything.py b/tests/test_everything.py index d246d8c1a..766531c14 100644 --- a/tests/test_everything.py +++ b/tests/test_everything.py @@ -90,7 +90,7 @@ def test_everything(): LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, ], - strict_mode=True + strict_mode=True, ) test_case = LLMTestCase( From 26718b005967734274e13c5a7606a9045bb8a3c0 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Thu, 7 Mar 2024 22:40:22 +0800 Subject: [PATCH 27/59] faithfulness async --- deepeval/metrics/faithfulness/faithfulness.py | 96 ++++++++++++------- deepeval/models/gpt_model.py | 19 ++-- deepeval/utils.py | 10 ++ tests/test_answer_relevancy.py | 4 +- tests/test_faithfulness.py | 4 +- 5 files changed, 84 insertions(+), 49 deletions(-) diff --git a/deepeval/metrics/faithfulness/faithfulness.py b/deepeval/metrics/faithfulness/faithfulness.py index bb15c5086..b4a1244b6 100644 --- a/deepeval/metrics/faithfulness/faithfulness.py +++ b/deepeval/metrics/faithfulness/faithfulness.py @@ -1,11 +1,12 @@ from typing import List, Optional, Union import json from pydantic import BaseModel, Field +import asyncio from concurrent.futures import ThreadPoolExecutor from deepeval.test_case import LLMTestCase from deepeval.metrics import BaseMetric -from deepeval.utils import trimAndLoadJson +from deepeval.utils import trimAndLoadJson, get_or_create_event_loop from deepeval.models import GPTModel, DeepEvalBaseLLM from deepeval.metrics.faithfulness.template import FaithfulnessTemplate from deepeval.progress_context import metrics_progress_context @@ -23,7 +24,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - multithreading: bool = True, + use_async: bool = True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -33,7 +34,7 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.multithreading = multithreading + self.use_async = use_async self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase): @@ -48,35 +49,31 @@ def measure(self, test_case: LLMTestCase): with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode ): - if self.multithreading: - # Use multithreading to generate truths and claims in parallel - with ThreadPoolExecutor() as executor: - future_truths = executor.submit( - self._generate_truths, test_case.retrieval_context + if self.use_async: + loop = get_or_create_event_loop() + self.truths, self.claims = loop.run_until_complete( + asyncio.gather( + 
self._a_generate_truths(test_case.retrieval_context), + self._a_generate_claims(test_case.actual_output), ) - future_claims = executor.submit( - self._generate_claims, test_case.actual_output - ) - self.truths: List[str] = future_truths.result() - self.claims: List[str] = future_claims.result() - else: - # Sequential execution - self.truths: List[str] = self._generate_truths( - test_case.retrieval_context ) - self.claims: List[str] = self._generate_claims( - test_case.actual_output + self.verdicts = loop.run_until_complete( + self._a_generate_verdicts() ) + self.score = self._generate_score() + self.reason = loop.run_until_complete(self._a_generate_reason()) + else: + self.truths = self._generate_truths(test_case.retrieval_context) + self.claims = self._generate_claims(test_case.actual_output) + self.verdicts = self._generate_verdicts() + self.score = self._generate_score() + self.reason = self._generate_reason() - self.verdicts: List[FaithfulnessVerdict] = self._generate_verdicts() - faithfulness_score = self._generate_score() - self.reason = self._generate_reason(faithfulness_score) - self.success = faithfulness_score >= self.threshold - self.score = faithfulness_score + self.success = self.score >= self.threshold capture_metric_type(self.__name__) return self.score - def _generate_reason(self, score) -> str: + async def _a_generate_reason(self) -> str: if self.include_reason is False: return None @@ -87,12 +84,19 @@ def _generate_reason(self, score) -> str: prompt: dict = FaithfulnessTemplate.generate_reason( contradictions=contradictions, - score=format(score, ".2f"), + score=format(self.score, ".2f"), ) - res = self.model(prompt) + if self.use_async: + res = await self.model.a_generate(prompt) + else: + res = self.model(prompt) return res + def _generate_reason(self) -> str: + loop = asyncio.get_event_loop() + return loop.run_until_complete(self._a_generate_reason()) + def _generate_score(self) -> float: number_of_verdicts = len(self.verdicts) if number_of_verdicts == 0: @@ -107,34 +111,58 @@ def _generate_score(self) -> float: return 0 if self.strict_mode and score < self.threshold else score - def _generate_verdicts(self) -> List[FaithfulnessVerdict]: + async def _a_generate_verdicts(self) -> List[FaithfulnessVerdict]: verdicts: List[FaithfulnessVerdict] = [] prompt = FaithfulnessTemplate.generate_verdicts( claims=self.claims, retrieval_context="\n\n".join(self.truths) ) - res = self.model(prompt) + if self.use_async: + res = await self.model.a_generate(prompt) + else: + res = self.model(prompt) data = trimAndLoadJson(res) verdicts = [FaithfulnessVerdict(**item) for item in data["verdicts"]] return verdicts - def _generate_truths(self, retrieval_context: str) -> List[str]: + def _generate_verdicts(self) -> List[FaithfulnessVerdict]: + loop = asyncio.get_event_loop() + return loop.run_until_complete(self._a_generate_verdicts()) + + async def _a_generate_truths(self, retrieval_context: str) -> List[str]: + print("generating truths") prompt = FaithfulnessTemplate.generate_claims( text="\n\n".join(retrieval_context) ) - res = self.model(prompt) + if self.use_async: + res = await self.model.a_generate(prompt) + else: + res = self.model(prompt) data = trimAndLoadJson(res) return data["claims"] - def _generate_claims(self, actual_output: str) -> List[str]: + def _generate_truths(self, retrieval_context: str) -> List[str]: + loop = asyncio.get_event_loop() + return loop.run_until_complete( + self._a_generate_truths(retrieval_context) + ) + + async def _a_generate_claims(self, actual_output: 
str) -> List[str]: + print("generating claims") prompt = FaithfulnessTemplate.generate_claims(text=actual_output) - res = self.model(prompt) + if self.use_async: + res = await self.model.a_generate(prompt) + else: + res = self.model(prompt) data = trimAndLoadJson(res) - return data["claims"] + def _generate_claims(self, actual_output: str) -> List[str]: + loop = asyncio.get_event_loop() + return loop.run_until_complete(self._a_generate_claims(actual_output)) + def is_successful(self) -> bool: self.success = self.score >= self.threshold return self.success diff --git a/deepeval/models/gpt_model.py b/deepeval/models/gpt_model.py index 2c6e60304..09d38fe52 100644 --- a/deepeval/models/gpt_model.py +++ b/deepeval/models/gpt_model.py @@ -25,30 +25,23 @@ class GPTModel(DeepEvalBaseLLM): def __init__( self, - model: Optional[Union[str, BaseChatModel]] = None, + model: Optional[str] = None, *args, **kwargs, ): model_name = None - custom_model = None if isinstance(model, str): model_name = model if model_name not in valid_gpt_models: raise ValueError( f"Invalid model. Available GPT models: {', '.join(model for model in valid_gpt_models)}" ) - elif isinstance(model, BaseChatModel): - custom_model = model elif model is None: model_name = default_gpt_model - self.custom_model = custom_model super().__init__(model_name, *args, **kwargs) def load_model(self): - if self.custom_model: - return self.custom_model - if self.should_use_azure_openai(): openai_api_key = KEY_FILE_HANDLER.fetch_data( KeyValues.AZURE_OPENAI_API_KEY @@ -86,14 +79,18 @@ def _call(self, prompt: str) -> str: chat_model = self.load_model() return chat_model.invoke(prompt).content + @retry_with_exponential_backoff + async def a_generate(self, prompt: str) -> str: + chat_model = self.load_model() + res = await chat_model.ainvoke(prompt) + print(res) + return res.content + def should_use_azure_openai(self): value = KEY_FILE_HANDLER.fetch_data(KeyValues.USE_AZURE_OPENAI) return value.lower() == "yes" if value is not None else False def get_model_name(self): - if self.custom_model: - return self.custom_model._llm_type - if self.should_use_azure_openai(): return "azure openai" elif self.model_name: diff --git a/deepeval/utils.py b/deepeval/utils.py index 297803620..532192f19 100644 --- a/deepeval/utils.py +++ b/deepeval/utils.py @@ -11,10 +11,20 @@ import numpy as np from dataclasses import asdict, is_dataclass import re +import asyncio from deepeval.key_handler import KeyValues, KEY_FILE_HANDLER +def get_or_create_event_loop() -> asyncio.AbstractEventLoop: + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + return loop + + def login_with_confident_api_key(api_key: string): from rich import print diff --git a/tests/test_answer_relevancy.py b/tests/test_answer_relevancy.py index 42090dc54..8c77bf970 100644 --- a/tests/test_answer_relevancy.py +++ b/tests/test_answer_relevancy.py @@ -46,9 +46,9 @@ """ -@pytest.mark.skip(reason="openai is very expensive") +# @pytest.mark.skip(reason="openai is very expensive") def test_answer_relevancy(): - metric = AnswerRelevancyMetric(threshold=0.5, strict_mode=True) + metric = AnswerRelevancyMetric(threshold=0.5) test_case = LLMTestCase( input=question, actual_output=answer, diff --git a/tests/test_faithfulness.py b/tests/test_faithfulness.py index 7b3a5c27c..ffdec49ab 100644 --- a/tests/test_faithfulness.py +++ b/tests/test_faithfulness.py @@ -38,12 +38,12 @@ """ -@pytest.mark.skip(reason="openai is expensive") +# 
@pytest.mark.skip(reason="openai is expensive") def test_faithfulness(): test_case = LLMTestCase( input="What is the primary difference between a comet and an asteroid?", actual_output=output, retrieval_context=[one, two, three], ) - metric = FaithfulnessMetric() + metric = FaithfulnessMetric(use_async=False) assert_test(test_case, [metric]) From 72c5d58325e70b4a225d1066b4f8ba75e4c6b7f0 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Thu, 7 Mar 2024 23:19:34 +0800 Subject: [PATCH 28/59] AR and bias async --- .../answer_relevancy/answer_relevancy.py | 95 +++++++++++++------ deepeval/metrics/bias/bias.py | 75 +++++++++++---- deepeval/metrics/faithfulness/faithfulness.py | 24 +++-- deepeval/models/base_model.py | 12 ++- deepeval/models/gpt_model.py | 3 +- tests/test_bias.py | 2 +- tests/test_faithfulness.py | 2 +- 7 files changed, 144 insertions(+), 69 deletions(-) diff --git a/deepeval/metrics/answer_relevancy/answer_relevancy.py b/deepeval/metrics/answer_relevancy/answer_relevancy.py index fde7fe151..cfb89c6cd 100644 --- a/deepeval/metrics/answer_relevancy/answer_relevancy.py +++ b/deepeval/metrics/answer_relevancy/answer_relevancy.py @@ -1,7 +1,7 @@ from typing import Optional, List, Union from pydantic import BaseModel, Field -from deepeval.utils import trimAndLoadJson +from deepeval.utils import trimAndLoadJson, get_or_create_event_loop from deepeval.test_case import LLMTestCase from deepeval.metrics import BaseMetric from deepeval.models import GPTModel, DeepEvalBaseLLM @@ -21,6 +21,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, + run_async: bool = True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -30,33 +31,40 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason + self.run_async = run_async self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: if test_case.input is None or test_case.actual_output is None: - raise ValueError( - "Input, actual output, or retrieval context cannot be None" - ) + raise ValueError("Input or actual output cannot be None") with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode ): - # generate statements - self.statements: List[str] = self._generate_statements( - test_case.actual_output - ) - - # generate verdicts based on statements, and retrieval context - self.verdicts: List[AnswerRelvancyVerdict] = ( - self._generate_verdicts(test_case.input) - ) - - answer_relevancy_score = self._generate_score() - - self.reason = self._generate_reason( - test_case.input, answer_relevancy_score - ) - self.success = answer_relevancy_score >= self.threshold - self.score = answer_relevancy_score + if self.run_async: + loop = get_or_create_event_loop() + self.statements: List[str] = loop.run_until_complete( + self._a_generate_statements(test_case.actual_output) + ) + self.verdicts: List[AnswerRelvancyVerdict] = ( + loop.run_until_complete( + self._a_generate_verdicts(test_case.input) + ) + ) + self.score = self._generate_score() + self.reason = loop.run_until_complete( + self._a_generate_reason(test_case.input) + ) + else: + self.statements: List[str] = self._generate_statements( + test_case.actual_output + ) + self.verdicts: List[AnswerRelvancyVerdict] = ( + self._generate_verdicts(test_case.input) + ) + self.score = self._generate_score() + self.reason = self._generate_reason(test_case.input) + + self.success = 
self.score >= self.threshold capture_metric_type(self.__name__) return self.score @@ -74,7 +82,7 @@ def _generate_score(self): return 0 if self.strict_mode and score < self.threshold else score - def _generate_reason(self, input: str, score: float) -> str: + async def _a_generate_reason(self, input: str) -> str: if self.include_reason is False: return None @@ -86,35 +94,66 @@ def _generate_reason(self, input: str, score: float) -> str: prompt = AnswerRelevancyTemplate.generate_reason( irrelevant_statements=irrelevant_statements, input=input, - score=format(score, ".2f"), + score=format(self.score, ".2f"), ) - res = self.model(prompt) + if self.run_async: + res = await self.model.a_generate(prompt) + else: + res = self.model.generate(prompt) + return res - def _generate_verdicts(self, input: str) -> List[AnswerRelvancyVerdict]: + def _generate_reason(self, input: str) -> str: + loop = get_or_create_event_loop() + return loop.run_until_complete(self._a_generate_reason(input)) + + async def _a_generate_verdicts( + self, input: str + ) -> List[AnswerRelvancyVerdict]: + print("generating verdicts") prompt = AnswerRelevancyTemplate.generate_verdicts( input=input, actual_output=self.statements, ) - res = self.model(prompt) + if self.run_async: + res = await self.model.a_generate(prompt) + else: + res = self.model.generate(prompt) data = trimAndLoadJson(res) verdicts = [AnswerRelvancyVerdict(**item) for item in data["verdicts"]] return verdicts - def _generate_statements( + def _generate_verdicts(self, input: str) -> List[AnswerRelvancyVerdict]: + loop = get_or_create_event_loop() + return loop.run_until_complete(self._a_generate_verdicts(input)) + + async def _a_generate_statements( self, actual_output: str, ) -> List[str]: + print("generating statements") prompt = AnswerRelevancyTemplate.generate_statements( actual_output=actual_output, ) - res = self.model(prompt) + if self.run_async: + res = await self.model.a_generate(prompt) + else: + res = self.model.generate(prompt) data = trimAndLoadJson(res) return data["statements"] + def _generate_statements( + self, + actual_output: str, + ) -> List[str]: + loop = get_or_create_event_loop() + return loop.run_until_complete( + self._a_generate_statements(actual_output) + ) + def is_successful(self) -> bool: self.success = self.score >= self.threshold return self.success diff --git a/deepeval/metrics/bias/bias.py b/deepeval/metrics/bias/bias.py index 3d35cc3c1..b864c581b 100644 --- a/deepeval/metrics/bias/bias.py +++ b/deepeval/metrics/bias/bias.py @@ -6,7 +6,7 @@ from deepeval.progress_context import metrics_progress_context from deepeval.telemetry import capture_metric_type from deepeval.models import GPTModel, DeepEvalBaseLLM -from deepeval.utils import trimAndLoadJson +from deepeval.utils import trimAndLoadJson, get_or_create_event_loop from deepeval.metrics.bias.template import BiasTemplate @@ -22,6 +22,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, + run_async: bool = True, strict_mode: bool = False, ): self.threshold = 0 if strict_mode else threshold @@ -31,6 +32,7 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason + self.run_async = run_async self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase): @@ -40,20 +42,31 @@ def measure(self, test_case: LLMTestCase): with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode ): - self.opinions: 
List[str] = self._generate_opinions( - test_case.actual_output - ) - - self.verdicts: List[BiasVerdict] = self._generate_verdicts() - bias_score = self._generate_score() - self.reason = self._generate_reason(bias_score) - self.success = bias_score <= self.threshold - self.score = bias_score - + if self.run_async: + loop = get_or_create_event_loop() + self.opinions: List[str] = loop.run_until_complete( + self._a_generate_opinions(test_case.actual_output) + ) + self.verdicts: List[BiasVerdict] = loop.run_until_complete( + self._a_generate_verdicts() + ) + self.score = self._generate_score() + self.reason = loop.run_until_complete(self._a_generate_reason()) + + else: + self.opinions: List[str] = self._generate_opinions( + test_case.actual_output + ) + + self.verdicts: List[BiasVerdict] = self._generate_verdicts() + self.score = self._generate_score() + self.reason = self._generate_reason() + + self.success = self.score <= self.threshold capture_metric_type(self.__name__) return self.score - def _generate_reason(self, score) -> str: + async def _a_generate_reason(self) -> str: if self.include_reason is False: return None @@ -64,12 +77,19 @@ def _generate_reason(self, score) -> str: prompt: dict = BiasTemplate.generate_reason( biases=biases, - score=format(score, ".2f"), + score=format(self.score, ".2f"), ) - res = self.model(prompt) + if self.run_async: + res = await self.model.a_generate(prompt) + else: + res = self.model.generate(prompt) return res + def _generate_reason(self) -> str: + loop = get_or_create_event_loop() + return loop.run_until_complete(self._a_generate_reason()) + def _generate_score(self) -> float: number_of_verdicts = len(self.verdicts) if number_of_verdicts == 0: @@ -83,24 +103,37 @@ def _generate_score(self) -> float: score = bias_count / number_of_verdicts return 1 if self.strict_mode and score > self.threshold else score - def _generate_verdicts(self) -> List[BiasVerdict]: + async def _a_generate_verdicts(self) -> List[BiasVerdict]: verdicts: List[BiasVerdict] = [] prompt = BiasTemplate.generate_verdicts(opinions=self.opinions) - res = self.model(prompt) - data = trimAndLoadJson(res) + if self.run_async: + res = await self.model.a_generate(prompt) + else: + res = self.model(prompt) + data = trimAndLoadJson(res) verdicts = [BiasVerdict(**item) for item in data["verdicts"]] - return verdicts - def _generate_opinions(self, actual_output: str) -> List[str]: + def _generate_verdicts(self) -> List[BiasVerdict]: + loop = get_or_create_event_loop() + loop.run_until_complete(self._a_generate_verdicts()) + + async def _a_generate_opinions(self, actual_output: str) -> List[str]: prompt = BiasTemplate.generate_opinions(actual_output=actual_output) - res = self.model(prompt) - data = trimAndLoadJson(res) + if self.run_async: + res = await self.model.a_generate(prompt) + else: + res = self.model.generate(prompt) + data = trimAndLoadJson(res) return data["opinions"] + def _generate_opinions(self, actual_output: str) -> List[str]: + loop = get_or_create_event_loop() + return loop.run_until_complete(self._a_generate_opinions(actual_output)) + def is_successful(self) -> bool: return self.success diff --git a/deepeval/metrics/faithfulness/faithfulness.py b/deepeval/metrics/faithfulness/faithfulness.py index b4a1244b6..1fcc94117 100644 --- a/deepeval/metrics/faithfulness/faithfulness.py +++ b/deepeval/metrics/faithfulness/faithfulness.py @@ -1,8 +1,6 @@ from typing import List, Optional, Union -import json from pydantic import BaseModel, Field import asyncio -from concurrent.futures 
import ThreadPoolExecutor from deepeval.test_case import LLMTestCase from deepeval.metrics import BaseMetric @@ -24,7 +22,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - use_async: bool = True, + run_async: bool = True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -34,7 +32,7 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.use_async = use_async + self.run_async = run_async self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase): @@ -49,7 +47,7 @@ def measure(self, test_case: LLMTestCase): with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode ): - if self.use_async: + if self.run_async: loop = get_or_create_event_loop() self.truths, self.claims = loop.run_until_complete( asyncio.gather( @@ -87,10 +85,10 @@ async def _a_generate_reason(self) -> str: score=format(self.score, ".2f"), ) - if self.use_async: + if self.run_async: res = await self.model.a_generate(prompt) else: - res = self.model(prompt) + res = self.model.generate(prompt) return res def _generate_reason(self) -> str: @@ -117,10 +115,10 @@ async def _a_generate_verdicts(self) -> List[FaithfulnessVerdict]: prompt = FaithfulnessTemplate.generate_verdicts( claims=self.claims, retrieval_context="\n\n".join(self.truths) ) - if self.use_async: + if self.run_async: res = await self.model.a_generate(prompt) else: - res = self.model(prompt) + res = self.model.generate(prompt) data = trimAndLoadJson(res) verdicts = [FaithfulnessVerdict(**item) for item in data["verdicts"]] @@ -135,10 +133,10 @@ async def _a_generate_truths(self, retrieval_context: str) -> List[str]: prompt = FaithfulnessTemplate.generate_claims( text="\n\n".join(retrieval_context) ) - if self.use_async: + if self.run_async: res = await self.model.a_generate(prompt) else: - res = self.model(prompt) + res = self.model.generate(prompt) data = trimAndLoadJson(res) return data["claims"] @@ -152,10 +150,10 @@ def _generate_truths(self, retrieval_context: str) -> List[str]: async def _a_generate_claims(self, actual_output: str) -> List[str]: print("generating claims") prompt = FaithfulnessTemplate.generate_claims(text=actual_output) - if self.use_async: + if self.run_async: res = await self.model.a_generate(prompt) else: - res = self.model(prompt) + res = self.model.generate(prompt) data = trimAndLoadJson(res) return data["claims"] diff --git a/deepeval/models/base_model.py b/deepeval/models/base_model.py index f029397e7..1db4dddf6 100644 --- a/deepeval/models/base_model.py +++ b/deepeval/models/base_model.py @@ -43,11 +43,17 @@ def load_model(self, *args, **kwargs): """ pass - def __call__(self, *args: Any, **kwargs: Any) -> str: - return self._call(*args, **kwargs) + @abstractmethod + def generate(self, *args, **kwargs) -> str: + """Runs the model to output LLM response. + + Returns: + A string. + """ + pass @abstractmethod - def _call(self, *args, **kwargs) -> str: + async def a_generate(self, *args, **kwargs) -> str: """Runs the model to output LLM response. 
Returns: diff --git a/deepeval/models/gpt_model.py b/deepeval/models/gpt_model.py index 09d38fe52..cf694bec9 100644 --- a/deepeval/models/gpt_model.py +++ b/deepeval/models/gpt_model.py @@ -75,7 +75,7 @@ def load_model(self): return ChatOpenAI(model_name=self.model_name) @retry_with_exponential_backoff - def _call(self, prompt: str) -> str: + def generate(self, prompt: str) -> str: chat_model = self.load_model() return chat_model.invoke(prompt).content @@ -83,7 +83,6 @@ def _call(self, prompt: str) -> str: async def a_generate(self, prompt: str) -> str: chat_model = self.load_model() res = await chat_model.ainvoke(prompt) - print(res) return res.content def should_use_azure_openai(self): diff --git a/tests/test_bias.py b/tests/test_bias.py index 71afbbada..ff6165498 100644 --- a/tests/test_bias.py +++ b/tests/test_bias.py @@ -30,7 +30,7 @@ """ -@pytest.mark.skip(reason="openai is expensive") +# @pytest.mark.skip(reason="openai is expensive") def test_bias(): test_case = LLMTestCase( input="What is the primary difference between a comet and an asteroid?", diff --git a/tests/test_faithfulness.py b/tests/test_faithfulness.py index ffdec49ab..81e10c716 100644 --- a/tests/test_faithfulness.py +++ b/tests/test_faithfulness.py @@ -45,5 +45,5 @@ def test_faithfulness(): actual_output=output, retrieval_context=[one, two, three], ) - metric = FaithfulnessMetric(use_async=False) + metric = FaithfulnessMetric(run_async=True) assert_test(test_case, [metric]) From 5d1306902724ad60ae033cd3b581d2378eeb384e Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Fri, 8 Mar 2024 00:53:27 +0800 Subject: [PATCH 29/59] metrics async --- .../contextual_precision.py | 77 +++++--- .../contextual_recall/contextual_recall.py | 70 +++++-- .../metrics/hallucination/hallucination.py | 115 +++++++---- .../metrics/summarization/summarization.py | 185 ++++++++++++------ deepeval/metrics/toxicity/toxicity.py | 71 +++++-- tests/test_contextual_precision.py | 2 +- tests/test_contextual_recall.py | 2 +- tests/test_hallucination.py | 8 +- tests/test_summarization.py | 4 +- tests/test_toxic.py | 2 +- 10 files changed, 361 insertions(+), 175 deletions(-) diff --git a/deepeval/metrics/contextual_precision/contextual_precision.py b/deepeval/metrics/contextual_precision/contextual_precision.py index 6fe29bcc6..0ff74d698 100644 --- a/deepeval/metrics/contextual_precision/contextual_precision.py +++ b/deepeval/metrics/contextual_precision/contextual_precision.py @@ -1,8 +1,7 @@ from typing import Optional, List, Union -from pydantic import BaseModel, Field -import json +from pydantic import BaseModel -from deepeval.utils import trimAndLoadJson +from deepeval.utils import trimAndLoadJson, get_or_create_event_loop from deepeval.test_case import LLMTestCase from deepeval.metrics import BaseMetric from deepeval.models import GPTModel, DeepEvalBaseLLM @@ -24,6 +23,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, + run_async: bool = True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -33,6 +33,7 @@ def __init__( else: self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() + self.run_async = run_async self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: @@ -49,25 +50,38 @@ def measure(self, test_case: LLMTestCase) -> float: with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode ): - self.verdicts: List[ContextualPrecisionVerdict] = ( - 
self._generate_verdicts( - test_case.input, - test_case.expected_output, - test_case.retrieval_context, - ) - ) - contextual_precision_score = self._generate_score() - self.reason = self._generate_reason( - test_case.input, contextual_precision_score - ) + if self.run_async: + loop = get_or_create_event_loop() + self.verdicts: List[ContextualPrecisionVerdict] = ( + loop.run_until_complete( + self._a_generate_verdicts( + test_case.input, + test_case.expected_output, + test_case.retrieval_context, + ) + ) + ) + self.score = self._generate_score() + self.reason = loop.run_until_complete( + self._a_generate_reason(test_case.input) + ) + else: + self.verdicts: List[ContextualPrecisionVerdict] = ( + self._generate_verdicts( + test_case.input, + test_case.expected_output, + test_case.retrieval_context, + ) + ) + self.score = self._generate_score() + self.reason = self._generate_reason(test_case.input) - self.success = contextual_precision_score >= self.threshold - self.score = contextual_precision_score + self.success = self.score >= self.threshold capture_metric_type(self.__name__) return self.score - def _generate_reason(self, input: str, score: float): + async def _a_generate_reason(self, input: str): if self.include_reason is False: return None @@ -83,12 +97,20 @@ def _generate_reason(self, input: str, score: float): # for example, i can still have a perfect score with [1 1 0 0], # which then GPT will need the entire context to justify why the score is so high verdicts=retrieval_contexts_verdicts, - score=format(score, ".2f"), + score=format(self.score, ".2f"), ) - res = self.model(prompt) + if self.run_async: + res = await self.model.a_generate(prompt) + else: + res = self.model.generate(prompt) + return res + def _generate_reason(self, input: str): + loop = get_or_create_event_loop() + loop.run_until_complete(self._a_generate_reason(input)) + def _generate_score(self): number_of_verdicts = len(self.verdicts) if number_of_verdicts == 0: @@ -119,7 +141,7 @@ def _generate_score(self): return 0 if self.strict_mode and score < self.threshold else score - def _generate_verdicts( + async def _a_generate_verdicts( self, input: str, expected_output: str, retrieval_context: List[str] ) -> List[ContextualPrecisionVerdict]: prompt = ContextualPrecisionTemplate.generate_verdicts( @@ -128,14 +150,25 @@ def _generate_verdicts( retrieval_context=retrieval_context, ) - res = self.model(prompt) + if self.run_async: + res = await self.model.a_generate(prompt) + else: + res = self.model.generate(prompt) + data = trimAndLoadJson(res) verdicts = [ ContextualPrecisionVerdict(**item) for item in data["verdicts"] ] - return verdicts + def _generate_verdicts( + self, input: str, expected_output: str, retrieval_context: List[str] + ) -> List[ContextualPrecisionVerdict]: + loop = get_or_create_event_loop() + return loop.run_until_complete( + self._a_generate_verdicts(input, expected_output, retrieval_context) + ) + def is_successful(self) -> bool: self.success = self.score >= self.threshold return self.success diff --git a/deepeval/metrics/contextual_recall/contextual_recall.py b/deepeval/metrics/contextual_recall/contextual_recall.py index 6c190773f..440987241 100644 --- a/deepeval/metrics/contextual_recall/contextual_recall.py +++ b/deepeval/metrics/contextual_recall/contextual_recall.py @@ -1,8 +1,7 @@ from typing import Optional, List, Union from pydantic import BaseModel, Field -import json -from deepeval.utils import trimAndLoadJson +from deepeval.utils import trimAndLoadJson, get_or_create_event_loop from 
deepeval.test_case import LLMTestCase from deepeval.metrics import BaseMetric from deepeval.models import GPTModel, DeepEvalBaseLLM @@ -22,6 +21,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, + run_async: bool = True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -31,6 +31,7 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason + self.run_async = run_async self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: @@ -46,24 +47,34 @@ def measure(self, test_case: LLMTestCase) -> float: with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode ): - self.verdicts: List[ContextualRecallVerdict] = ( - self._generate_verdicts( - test_case.expected_output, test_case.retrieval_context + if self.run_async: + loop = get_or_create_event_loop() + self.verdicts: List[ContextualRecallVerdict] = ( + loop.run_until_complete( + self._a_generate_verdicts( + test_case.expected_output, + test_case.retrieval_context, + ) + ) ) - ) - - contextual_recall_score = self._generate_score() - - self.reason = self._generate_reason( - test_case.expected_output, contextual_recall_score - ) + self.score = self._generate_score() + self.reason = loop.run_until_complete( + self._a_generate_reason(test_case.expected_output) + ) + else: + self.verdicts: List[ContextualRecallVerdict] = ( + self._generate_verdicts( + test_case.expected_output, test_case.retrieval_context + ) + ) + self.score = self._generate_score() + self.reason = self._generate_reason(test_case.expected_output) - self.success = contextual_recall_score >= self.threshold - self.score = contextual_recall_score + self.success = self.score >= self.threshold capture_metric_type(self.__name__) return self.score - def _generate_reason(self, expected_output: str, score: float): + async def _a_generate_reason(self, expected_output: str): if self.include_reason is False: return None @@ -79,12 +90,20 @@ def _generate_reason(self, expected_output: str, score: float): expected_output=expected_output, supportive_reasons=supportive_reasons, unsupportive_reasons=unsupportive_reasons, - score=format(score, ".2f"), + score=format(self.score, ".2f"), ) - res = self.model(prompt) + if self.run_async: + res = await self.model.a_generate(prompt) + else: + res = self.model.generate(prompt) + return res + def _generate_reason(self, expected_output: str): + loop = get_or_create_event_loop() + return loop.run_until_complete(self._a_generate_reason(expected_output)) + def _generate_score(self): number_of_verdicts = len(self.verdicts) if number_of_verdicts == 0: @@ -99,20 +118,31 @@ def _generate_score(self): return 0 if self.strict_mode and score < self.threshold else score - def _generate_verdicts( + async def _a_generate_verdicts( self, expected_output: str, retrieval_context: List[str] ) -> List[ContextualRecallVerdict]: prompt = ContextualRecallTemplate.generate_verdicts( expected_output=expected_output, retrieval_context=retrieval_context ) - res = self.model(prompt) + if self.run_async: + res = await self.model.a_generate(prompt) + else: + res = self.model.generate(prompt) + data = trimAndLoadJson(res) verdicts = [ ContextualRecallVerdict(**item) for item in data["verdicts"] ] - return verdicts + def _generate_verdicts( + self, expected_output: str, retrieval_context: List[str] + ) -> List[ContextualRecallVerdict]: + loop = 
get_or_create_event_loop() + return loop.run_until_complete( + self._a_generate_verdicts(expected_output, retrieval_context) + ) + def is_successful(self) -> bool: self.success = self.score >= self.threshold return self.success diff --git a/deepeval/metrics/hallucination/hallucination.py b/deepeval/metrics/hallucination/hallucination.py index 16303e22e..f7cfab79a 100644 --- a/deepeval/metrics/hallucination/hallucination.py +++ b/deepeval/metrics/hallucination/hallucination.py @@ -1,3 +1,4 @@ +import asyncio from typing import Optional, Union, List from threading import Lock from concurrent.futures import ThreadPoolExecutor, as_completed @@ -5,7 +6,7 @@ from deepeval.test_case import LLMTestCase from deepeval.metrics import BaseMetric -from deepeval.utils import trimAndLoadJson +from deepeval.utils import trimAndLoadJson, get_or_create_event_loop from deepeval.metrics.hallucination.template import HallucinationTemplate from deepeval.models import GPTModel, DeepEvalBaseLLM from deepeval.progress_context import metrics_progress_context @@ -23,7 +24,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - multithreading: bool = True, + run_async: bool = False, strict_mode: bool = False, ): self.threshold = 0 if strict_mode else threshold @@ -33,7 +34,7 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.multithreading = multithreading + self.run_async = run_async self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase): @@ -46,17 +47,32 @@ def measure(self, test_case: LLMTestCase): with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode ): - self.verdicts: List[HallucinationVerdict] = self._generate_verdicts( - test_case.actual_output, test_case.context - ) - hallucination_score = self._generate_score() - self.reason = self._generate_reason(hallucination_score) - self.success = hallucination_score <= self.threshold - self.score = hallucination_score + if self.run_async: + loop = get_or_create_event_loop() + self.verdicts: List[HallucinationVerdict] = ( + loop.run_until_complete( + self._a_generate_verdicts( + test_case.actual_output, test_case.context + ) + ) + ) + self.score = self._generate_score() + self.reason = loop.run_until_complete(self._a_generate_reason()) + + else: + self.verdicts: List[HallucinationVerdict] = ( + self._generate_verdicts( + test_case.actual_output, test_case.context + ) + ) + self.score = self._generate_score() + self.reason = self._generate_reason() + + self.success = self.score <= self.threshold capture_metric_type(self.__name__) return self.score - def _generate_reason(self, score): + async def _a_generate_reason(self): if self.include_reason is False: return None @@ -71,12 +87,20 @@ def _generate_reason(self, score): prompt: dict = HallucinationTemplate.generate_reason( factual_alignments=factual_alignments, contradictions=contradictions, - score=format(score, ".2f"), + score=format(self.score, ".2f"), ) - res = self.model(prompt) + if self.run_async: + res = await self.model.a_generate(prompt) + else: + res = self.model.generate(prompt) + return res + def _generate_reason(self): + loop = get_or_create_event_loop() + return loop.run_until_complete(self._a_generate_reason()) + def _generate_score(self) -> float: number_of_verdicts = len(self.verdicts) if number_of_verdicts == 0: @@ -92,32 +116,23 @@ def _generate_score(self) -> float: return 1 if 
self.strict_mode and score > self.threshold else score - def _generate_verdicts( + async def _a_generate_verdicts( self, actual_output: str, contexts: List[str] ) -> List[HallucinationVerdict]: verdicts: List[HallucinationVerdict] = [] - if self.multithreading: - lock = Lock() - with ThreadPoolExecutor() as executor: - futures = { - executor.submit( - self._generate_verdict, - actual_output, - context, - verdicts, - lock, - ): context - for context in contexts - } - - for future in as_completed(futures): - future.result() + if self.run_async: + tasks = [ + self._a_generate_verdict(actual_output, context) + for context in contexts + ] + results = await asyncio.gather(*tasks) + verdicts.extend(results) else: prompt = HallucinationTemplate.generate_verdicts( actual_output=actual_output, contexts=contexts ) - res = self.model(prompt) + res = self.model.generate(prompt) data = trimAndLoadJson(res) verdicts = [ HallucinationVerdict(**item) for item in data["verdicts"] @@ -125,30 +140,46 @@ def _generate_verdicts( return verdicts - def _generate_verdict( - self, - actual_output: str, - context: str, - verdicts: List[HallucinationVerdict], - lock: Lock, + def _generate_verdicts( + self, actual_output: str, contexts: List[str] + ) -> List[HallucinationVerdict]: + loop = get_or_create_event_loop() + return loop.run_until_complete( + self._a_generate_verdicts(actual_output, contexts) + ) + + async def _a_generate_verdict( + self, actual_output: str, context: str ) -> HallucinationVerdict: + print("generating verdict") ####################################### ### Generate verdicts for [context] ### ####################################### prompt = HallucinationTemplate.generate_verdicts( actual_output=actual_output, contexts=[context] ) - res = self.model(prompt) - data = trimAndLoadJson(res) + if self.run_async: + res = await self.model.a_generate(prompt) + else: + res = self.model.generate(prompt) + + data = trimAndLoadJson(res) # verdicts length will always be 1 final_verdicts = [ HallucinationVerdict(**item) for item in data["verdicts"] ] - with lock: - for final_verdict in final_verdicts: - verdicts.append(final_verdict) + return final_verdicts[0] + + def _generate_verdict( + self, actual_output: str, context: str + ) -> HallucinationVerdict: + print("generating verdict") + loop = get_or_create_event_loop() + return loop.run_until_complete( + self._a_generate_verdict(actual_output, context) + ) def is_successful(self) -> bool: self.success = self.score <= self.threshold diff --git a/deepeval/metrics/summarization/summarization.py b/deepeval/metrics/summarization/summarization.py index 0d5302afe..9045a600a 100644 --- a/deepeval/metrics/summarization/summarization.py +++ b/deepeval/metrics/summarization/summarization.py @@ -1,3 +1,4 @@ +import asyncio from typing import List, Optional, Union from enum import Enum from pydantic import BaseModel, Field @@ -6,7 +7,7 @@ from deepeval.test_case import LLMTestCase from deepeval.metrics import BaseMetric from deepeval.models import GPTModel, DeepEvalBaseLLM -from deepeval.utils import trimAndLoadJson +from deepeval.utils import trimAndLoadJson, get_or_create_event_loop from deepeval.metrics.summarization.template import SummarizationTemplate from deepeval.metrics.faithfulness.template import FaithfulnessTemplate from deepeval.progress_context import metrics_progress_context @@ -38,7 +39,7 @@ def __init__( model: Optional[Union[str, DeepEvalBaseLLM]] = None, assessment_questions: Optional[List[str]] = None, include_reason: bool = True, - multithreading=True, 
+ run_async=True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -53,7 +54,7 @@ def __init__( else: self.assessment_questions = assessment_questions - self.multithreading = multithreading + self.run_async = run_async self.include_reason = include_reason self.n = n self.strict_mode = strict_mode @@ -65,31 +66,33 @@ def measure(self, test_case: LLMTestCase): with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode ): - if test_case.input is None or test_case.actual_output is None: - raise ValueError("Input and actual output cannot be None") - - if self.multithreading: - # Use multithreading to generate truths and claims in parallel - with ThreadPoolExecutor() as executor: - future_truths = executor.submit( - # Claims made in the original text === truths - self._generate_claims, - test_case.input, + if self.run_async: + loop = get_or_create_event_loop() + self.truths, self.claims = loop.run_until_complete( + asyncio.gather( + self._a_generate_claims(test_case.input), + self._a_generate_claims(test_case.actual_output), ) - future_claims = executor.submit( - self._generate_claims, test_case.actual_output - ) - future_coverage_verdicts = executor.submit( - self._generate_coverage_verdicts, test_case + ) + self.coverage_verdicts, self.alignment_verdicts = ( + loop.run_until_complete( + asyncio.gather( + self._a_generate_coverage_verdicts(test_case), + self._a_generate_alignment_verdicts(), + ) ) + ) + alignment_score = self._generate_score(ScoreType.ALIGNMENT) + coverage_score = self._generate_score(ScoreType.COVERAGE) + + self.score_breakdown = { + ScoreType.ALIGNMENT.value: alignment_score, + ScoreType.COVERAGE.value: coverage_score, + } + self.score = min(alignment_score, coverage_score) + self.reason = loop.run_until_complete(self._a_generate_reason()) - self.truths: List[str] = future_truths.result() - self.claims: List[str] = future_claims.result() - self.coverage_verdicts: List[ - SummarizationCoverageVerdict - ] = future_coverage_verdicts.result() else: - # Sequential execution self.truths: List[str] = self._generate_claims(test_case.input) self.claims: List[str] = self._generate_claims( test_case.actual_output @@ -97,25 +100,24 @@ def measure(self, test_case: LLMTestCase): self.coverage_verdicts: List[SummarizationCoverageVerdict] = ( self._generate_coverage_verdicts(test_case) ) + self.alignment_verdicts: List[SummarizationAlignmentVerdict] = ( + self._generate_alignment_verdicts() + ) + alignment_score = self._generate_score(ScoreType.ALIGNMENT) + coverage_score = self._generate_score(ScoreType.COVERAGE) - self.alignment_verdicts: List[SummarizationAlignmentVerdict] = ( - self._generate_alignment_verdicts() - ) - alignment_score = self._generate_score(ScoreType.ALIGNMENT) - coverage_score = self._generate_score(ScoreType.COVERAGE) - - self.score_breakdown = { - ScoreType.ALIGNMENT.value: alignment_score, - ScoreType.COVERAGE.value: coverage_score, - } - summarization_score = min(alignment_score, coverage_score) - self.reason = self._generate_reason(summarization_score) - self.success = summarization_score >= self.threshold - self.score = summarization_score + self.score_breakdown = { + ScoreType.ALIGNMENT.value: alignment_score, + ScoreType.COVERAGE.value: coverage_score, + } + self.score = min(alignment_score, coverage_score) + self.reason = self._generate_reason() + + self.success = self.score >= self.threshold capture_metric_type(self.__name__) return self.score - def _generate_reason(self, score: float) -> str: + async 
def _a_generate_reason(self) -> str: if self.include_reason is False: return None @@ -140,7 +142,7 @@ def _generate_reason(self, score: float) -> str: contradictions=contradictions, redundancies=redundancies, questions=questions, - score=format(score, ".2f"), + score=format(self.score, ".2f"), ) if len(questions) > 0: @@ -150,9 +152,17 @@ def _generate_reason(self, score: float) -> str: """ prompt += """Reason:""" - res = self.model(prompt) + if self.run_async: + res = await self.model.a_generate(prompt) + else: + res = self.model.generate(prompt) + return res + def _generate_reason(self) -> str: + loop = get_or_create_event_loop() + return loop.run_until_complete(self._a_generate_reason) + def _generate_score(self, score_type: ScoreType) -> float: if score_type == ScoreType.ALIGNMENT: total = len(self.alignment_verdicts) @@ -185,39 +195,60 @@ def _generate_score(self, score_type: ScoreType) -> float: return 0 if self.strict_mode and score < self.threshold else score - def _generate_answers(self, text: str) -> List[str]: + async def _a_generate_answers(self, text: str) -> List[str]: prompt = SummarizationTemplate.generate_answers( questions=self.assessment_questions, text=text ) - res = self.model(prompt) + + if self.run_async: + res = await self.model.a_generate(prompt) + else: + res = self.model.generate(prompt) + data = trimAndLoadJson(res) return data["answers"] - def _generate_assessment_questions(self, text: str): + def _generate_answers(self, text: str) -> List[str]: + loop = get_or_create_event_loop() + return loop.run_until_complete(self._a_generate_answers(text)) + + async def _a_generate_assessment_questions(self, text: str): prompt = SummarizationTemplate.generate_questions(text=text, n=self.n) - res = self.model(prompt) + if self.run_async: + res = await self.model.a_generate(prompt) + else: + res = self.model.generate(prompt) + data = trimAndLoadJson(res) return data["questions"] - def _generate_coverage_verdicts( + def _generate_assessment_questions(self, text: str): + loop = get_or_create_event_loop() + return loop.run_until_complete( + self._a_generate_assessment_questions(text) + ) + + async def _a_generate_coverage_verdicts( self, test_case: LLMTestCase ) -> List[SummarizationCoverageVerdict]: if self.assessment_questions is None: - self.assessment_questions = self._generate_assessment_questions( - test_case.input - ) - - if self.multithreading: - with ThreadPoolExecutor() as executor: - future_original_answers: List[str] = executor.submit( - self._generate_answers, test_case.input + if self.run_async: + self.assessment_questions = ( + await self._a_generate_assessment_questions(test_case.input) ) - future_summary_answers: List[str] = executor.submit( - self._generate_answers, test_case.actual_output + else: + self.assessment_questions = self._generate_assessment_questions( + test_case.input ) - original_answers = future_original_answers.result() - summary_answers = future_summary_answers.result() + if self.run_async: + tasks = [ + self._a_generate_answers(test_case.input), + self._a_generate_answers(test_case.actual_output), + ] + results = await asyncio.gather(*tasks) + original_answers = results[0] + summary_answers = results[1] else: original_answers = self._generate_answers(test_case.input) summary_answers = self._generate_answers(test_case.actual_output) @@ -237,29 +268,55 @@ def _generate_coverage_verdicts( return coverage_veridcts - def _generate_alignment_verdicts( + def _generate_coverage_verdicts( + self, test_case: LLMTestCase + ) -> 
List[SummarizationCoverageVerdict]: + loop = get_or_create_event_loop() + return loop.run_until_complete( + self._a_generate_coverage_verdicts(test_case) + ) + + async def _a_generate_alignment_verdicts( self, ) -> List[SummarizationAlignmentVerdict]: verdicts: List[SummarizationAlignmentVerdict] = [] prompt = SummarizationTemplate.generate_alignment_verdicts( summary_claims=self.claims, orignal_text="\n\n".join(self.truths) ) - res = self.model(prompt) + + if self.run_async: + res = await self.model.a_generate(prompt) + else: + res = self.model.generate(prompt) + data = trimAndLoadJson(res) verdicts = [ SummarizationAlignmentVerdict(**item) for item in data["verdicts"] ] - return verdicts - def _generate_claims(self, text: str) -> List[str]: + def _generate_alignment_verdicts( + self, + ) -> List[SummarizationAlignmentVerdict]: + loop = get_or_create_event_loop() + return loop.run_until_complete(self._a_generate_alignment_verdicts()) + + async def _a_generate_claims(self, text: str) -> List[str]: # Borrow faithfulness template prompt = FaithfulnessTemplate.generate_claims(text=text) - res = self.model(prompt) - data = trimAndLoadJson(res) + if self.run_async: + res = await self.model.a_generate(prompt) + else: + res = self.model.generate(prompt) + + data = trimAndLoadJson(res) return data["claims"] + def _generate_claims(self, text: str) -> List[str]: + loop = get_or_create_event_loop() + return loop.run_until_complete(self._a_generate_claims(text)) + def is_successful(self) -> bool: self.success = self.score >= self.threshold return self.success diff --git a/deepeval/metrics/toxicity/toxicity.py b/deepeval/metrics/toxicity/toxicity.py index 80ec950d6..1e783ce9f 100644 --- a/deepeval/metrics/toxicity/toxicity.py +++ b/deepeval/metrics/toxicity/toxicity.py @@ -6,7 +6,7 @@ from deepeval.progress_context import metrics_progress_context from deepeval.telemetry import capture_metric_type from deepeval.models import GPTModel, DeepEvalBaseLLM -from deepeval.utils import trimAndLoadJson +from deepeval.utils import trimAndLoadJson, get_or_create_event_loop from deepeval.metrics.bias.template import BiasTemplate from deepeval.metrics.toxicity.template import ToxicityTemplate @@ -23,6 +23,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, + run_async: bool = True, strict_mode: bool = False, ): self.threshold = 0 if strict_mode else threshold @@ -32,6 +33,7 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason + self.run_async = run_async self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase): @@ -41,20 +43,31 @@ def measure(self, test_case: LLMTestCase): with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode ): - self.opinions: List[str] = self._generate_opinions( - test_case.actual_output - ) - - self.verdicts: List[ToxicityVerdict] = self._generate_verdicts() - toxicity_score = self._generate_score() - self.reason = self._generate_reason(toxicity_score) - self.success = toxicity_score <= self.threshold - self.score = toxicity_score - + if self.run_async: + loop = get_or_create_event_loop() + self.opinions: List[str] = loop.run_until_complete( + self._a_generate_opinions(test_case.actual_output) + ) + self.verdicts: List[ToxicityVerdict] = loop.run_until_complete( + self._a_generate_verdicts() + ) + self.score = self._generate_score() + self.reason = 
loop.run_until_complete(self._a_generate_reason()) + else: + self.opinions: List[str] = self._generate_opinions( + test_case.actual_output + ) + + self.verdicts: List[ToxicityVerdict] = self._generate_verdicts() + self.score = self._generate_score() + self.reason = self._generate_reason() + + self.success = self.score <= self.threshold + self.score = self.score capture_metric_type(self.__name__) return self.score - def _generate_reason(self, score) -> str: + async def _a_generate_reason(self) -> str: if self.include_reason is False: return None @@ -65,12 +78,20 @@ def _generate_reason(self, score) -> str: prompt: dict = ToxicityTemplate.generate_reason( toxics=toxics, - score=format(score, ".2f"), + score=format(self.score, ".2f"), ) - res = self.model(prompt) + if self.run_async: + res = await self.model.a_generate(prompt) + else: + res = self.model.generate(prompt) + return res + def _generate_reason(self) -> str: + loop = get_or_create_event_loop() + return loop.run_until_complete(self._a_generate_reason()) + def _generate_score(self) -> float: total = len(self.verdicts) if total == 0: @@ -85,24 +106,38 @@ def _generate_score(self) -> float: return 1 if self.strict_mode and score > self.threshold else score - def _generate_verdicts(self) -> List[ToxicityVerdict]: + async def _a_generate_verdicts(self) -> List[ToxicityVerdict]: verdicts: List[ToxicityVerdict] = [] prompt = ToxicityTemplate.generate_verdicts(opinions=self.opinions) - res = self.model(prompt) + if self.run_async: + res = await self.model.a_generate(prompt) + else: + res = self.model.generate(prompt) data = trimAndLoadJson(res) verdicts = [ToxicityVerdict(**item) for item in data["verdicts"]] return verdicts - def _generate_opinions(self, actual_output: str) -> List[str]: + def _generate_verdicts(self) -> List[ToxicityVerdict]: + loop = get_or_create_event_loop() + return loop.run_until_complete(self._a_generate_verdicts()) + + async def _a_generate_opinions(self, actual_output: str) -> List[str]: prompt = BiasTemplate.generate_opinions(actual_output=actual_output) - res = self.model(prompt) + if self.run_async: + res = await self.model.a_generate(prompt) + else: + res = self.model.generate(prompt) data = trimAndLoadJson(res) return data["opinions"] + def _generate_opinions(self, actual_output: str) -> List[str]: + loop = get_or_create_event_loop() + return loop.run_until_complete(self._a_generate_opinions()) + def is_successful(self) -> bool: return self.success diff --git a/tests/test_contextual_precision.py b/tests/test_contextual_precision.py index 0916f167a..5efed5073 100644 --- a/tests/test_contextual_precision.py +++ b/tests/test_contextual_precision.py @@ -59,7 +59,7 @@ """ -@pytest.mark.skip(reason="openai is expensive") +# @pytest.mark.skip(reason="openai is expensive") def test_contextual_precision(): metric = ContextualPrecisionMetric(threshold=0.5) test_case = LLMTestCase( diff --git a/tests/test_contextual_recall.py b/tests/test_contextual_recall.py index 3a5f7312b..c7a3dbc1a 100644 --- a/tests/test_contextual_recall.py +++ b/tests/test_contextual_recall.py @@ -38,7 +38,7 @@ """ -@pytest.mark.skip(reason="openai is expensive") +# @pytest.mark.skip(reason="openai is expensive") def test_contextual_recall(): test_case = LLMTestCase( input="What is the primary difference between a comet and an asteroid?", diff --git a/tests/test_hallucination.py b/tests/test_hallucination.py index cb51af232..5b34b2c06 100644 --- a/tests/test_hallucination.py +++ b/tests/test_hallucination.py @@ -5,7 +5,7 @@ import deepeval 
-@pytest.mark.skip(reason="openai is expensive") +# @pytest.mark.skip(reason="openai is expensive") def test_hallucination_metric(): metric = HallucinationMetric(threshold=0.5) test_case = LLMTestCase( @@ -20,7 +20,7 @@ def test_hallucination_metric(): assert_test(test_case, [metric]) -@pytest.mark.skip(reason="openai is expensive") +# @pytest.mark.skip(reason="openai is expensive") def test_hallucination_metric_2(): metric = HallucinationMetric(threshold=0.6) test_case = LLMTestCase( @@ -33,13 +33,13 @@ def test_hallucination_metric_2(): assert_test(test_case, [metric]) -@pytest.mark.skip(reason="openai is expensive") +# @pytest.mark.skip(reason="openai is expensive") def test_hallucination_metric_3(): metric = HallucinationMetric(threshold=0.6) test_case = LLMTestCase( input="placeholder", actual_output="Python is a programming language.", - context=["Python is a snake."], + context=["Python is a snake.", "Pythons like to lurk in the forests."], cost=0.1, latency=13.0, ) diff --git a/tests/test_summarization.py b/tests/test_summarization.py index fa6799e86..86fce12f8 100644 --- a/tests/test_summarization.py +++ b/tests/test_summarization.py @@ -4,9 +4,9 @@ from deepeval.metrics import SummarizationMetric -@pytest.mark.skip(reason="openai is expensive") +# @pytest.mark.skip(reason="openai is expensive") def test_summarization(): - metric = SummarizationMetric(multithreading=False) + metric = SummarizationMetric() input = """ In the rapidly evolving digital landscape, the proliferation of artificial intelligence (AI) technologies has been a game-changer in various industries, ranging from healthcare to finance. The integration of AI in these sectors has not only streamlined operations but also opened up new avenues for innovation and growth. In healthcare, AI algorithms are increasingly being used for diagnostic purposes, analyzing medical images, and providing personalized medicine solutions. This has significantly improved patient outcomes and has the potential to revolutionize healthcare delivery systems globally. For example, AI-driven tools can now detect anomalies in medical images with greater accuracy and speed than traditional methods, aiding in early diagnosis and treatment of diseases like cancer. 
diff --git a/tests/test_toxic.py b/tests/test_toxic.py index ac6cb84a4..eb0fdf848 100644 --- a/tests/test_toxic.py +++ b/tests/test_toxic.py @@ -11,7 +11,7 @@ """ -@pytest.mark.skip(reason="openai is expensive") +# @pytest.mark.skip(reason="openai is expensive") def test_bias(): test_case = LLMTestCase( input="What is the primary difference between a comet and an asteroid?", From 0d7bcfcf91d651887803de37d017052add31e8fb Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Fri, 8 Mar 2024 03:20:00 +0800 Subject: [PATCH 30/59] async WIP --- deepeval/evaluate.py | 4 + .../answer_relevancy/answer_relevancy.py | 107 +++++---- deepeval/metrics/bias/bias.py | 88 ++++---- .../contextual_precision.py | 78 ++++--- .../contextual_recall/contextual_recall.py | 72 +++--- .../contextual_relevancy.py | 4 +- deepeval/metrics/faithfulness/faithfulness.py | 138 +++++++----- .../metrics/hallucination/hallucination.py | 114 ++++------ .../knowledge_retention.py | 4 +- .../metrics/summarization/summarization.py | 211 ++++++++++-------- deepeval/metrics/toxicity/toxicity.py | 74 +++--- tests/test_answer_relevancy.py | 2 +- tests/test_bias.py | 2 +- tests/test_contextual_precision.py | 2 +- tests/test_contextual_recall.py | 2 +- tests/test_everything.py | 40 ++-- tests/test_faithfulness.py | 27 ++- 17 files changed, 525 insertions(+), 444 deletions(-) diff --git a/deepeval/evaluate.py b/deepeval/evaluate.py index ba254a288..7ea8888f4 100644 --- a/deepeval/evaluate.py +++ b/deepeval/evaluate.py @@ -50,6 +50,7 @@ def execute_test( test_cases: List[LLMTestCase], metrics: List[BaseMetric], save_to_disk: bool = False, + run_async: bool = True, ) -> List[TestResult]: test_results: List[TestResult] = [] test_run_manager.save_to_disk = save_to_disk @@ -73,7 +74,10 @@ def execute_test( test_start_time = time.perf_counter() for metric in metrics: + + # Long blocking I/O process metric.measure(test_case) + metric_metadata = MetricsMetadata( metric=metric.__name__, score=metric.score, diff --git a/deepeval/metrics/answer_relevancy/answer_relevancy.py b/deepeval/metrics/answer_relevancy/answer_relevancy.py index cfb89c6cd..31ab8c368 100644 --- a/deepeval/metrics/answer_relevancy/answer_relevancy.py +++ b/deepeval/metrics/answer_relevancy/answer_relevancy.py @@ -1,3 +1,4 @@ +import asyncio from typing import Optional, List, Union from pydantic import BaseModel, Field @@ -42,33 +43,21 @@ def measure(self, test_case: LLMTestCase) -> float: ): if self.run_async: loop = get_or_create_event_loop() - self.statements: List[str] = loop.run_until_complete( - self._a_generate_statements(test_case.actual_output) - ) - self.verdicts: List[AnswerRelvancyVerdict] = ( - loop.run_until_complete( - self._a_generate_verdicts(test_case.input) - ) - ) - self.score = self._generate_score() - self.reason = loop.run_until_complete( - self._a_generate_reason(test_case.input) - ) + loop.run_until_complete(self.a_measure(test_case)) else: - self.statements: List[str] = self._generate_statements( - test_case.actual_output + self.truths = self._generate_statements( + test_case.retrieval_context ) - self.verdicts: List[AnswerRelvancyVerdict] = ( - self._generate_verdicts(test_case.input) - ) - self.score = self._generate_score() - self.reason = self._generate_reason(test_case.input) + self.claims = self._generate_verdicts(test_case.actual_output) + self.verdicts = self._generate_verdicts() + self.score = self._calculate_score() + self.reason = self._generate_reason() self.success = self.score >= self.threshold capture_metric_type(self.__name__) return 
self.score - def _generate_score(self): + def _calculate_score(self): number_of_verdicts = len(self.verdicts) if number_of_verdicts == 0: return 0 @@ -79,9 +68,21 @@ def _generate_score(self): relevant_count += 1 score = relevant_count / number_of_verdicts - return 0 if self.strict_mode and score < self.threshold else score + ################################ + ###### Asynchronous logic ###### + ################################ + async def a_measure(self, test_case: LLMTestCase): + self.statements: List[str] = await self._a_generate_statements( + test_case.actual_output + ) + self.verdicts: List[AnswerRelvancyVerdict] = ( + await self._a_generate_verdicts(test_case.input) + ) + self.score = self._calculate_score() + self.reason = await self._a_generate_reason(test_case.input) + async def _a_generate_reason(self, input: str) -> str: if self.include_reason is False: return None @@ -96,18 +97,9 @@ async def _a_generate_reason(self, input: str) -> str: input=input, score=format(self.score, ".2f"), ) - - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) - + res = await self.model.a_generate(prompt) return res - def _generate_reason(self, input: str) -> str: - loop = get_or_create_event_loop() - return loop.run_until_complete(self._a_generate_reason(input)) - async def _a_generate_verdicts( self, input: str ) -> List[AnswerRelvancyVerdict]: @@ -116,19 +108,11 @@ async def _a_generate_verdicts( input=input, actual_output=self.statements, ) - - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) + res = await self.model.a_generate(prompt) data = trimAndLoadJson(res) verdicts = [AnswerRelvancyVerdict(**item) for item in data["verdicts"]] return verdicts - def _generate_verdicts(self, input: str) -> List[AnswerRelvancyVerdict]: - loop = get_or_create_event_loop() - return loop.run_until_complete(self._a_generate_verdicts(input)) - async def _a_generate_statements( self, actual_output: str, @@ -138,21 +122,50 @@ async def _a_generate_statements( actual_output=actual_output, ) - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) + res = await self.model.a_generate(prompt) data = trimAndLoadJson(res) return data["statements"] + ############################### + ###### Synchronous logic ###### + ############################### + def _generate_reason(self, input: str) -> str: + if self.include_reason is False: + return None + + irrelevant_statements = [] + for verdict in self.verdicts: + if verdict.verdict.strip().lower() == "no": + irrelevant_statements.append(verdict.reason) + + prompt = AnswerRelevancyTemplate.generate_reason( + irrelevant_statements=irrelevant_statements, + input=input, + score=format(self.score, ".2f"), + ) + res = self.model.generate(prompt) + return res + + def _generate_verdicts(self, input: str) -> List[AnswerRelvancyVerdict]: + prompt = AnswerRelevancyTemplate.generate_verdicts( + input=input, + actual_output=self.statements, + ) + res = self.model.generate(prompt) + data = trimAndLoadJson(res) + verdicts = [AnswerRelvancyVerdict(**item) for item in data["verdicts"]] + return verdicts + def _generate_statements( self, actual_output: str, ) -> List[str]: - loop = get_or_create_event_loop() - return loop.run_until_complete( - self._a_generate_statements(actual_output) + prompt = AnswerRelevancyTemplate.generate_statements( + actual_output=actual_output, ) + res = self.model.generate(prompt) + data = 
trimAndLoadJson(res) + return data["statements"] def is_successful(self) -> bool: self.success = self.score >= self.threshold diff --git a/deepeval/metrics/bias/bias.py b/deepeval/metrics/bias/bias.py index b864c581b..039bc1423 100644 --- a/deepeval/metrics/bias/bias.py +++ b/deepeval/metrics/bias/bias.py @@ -44,28 +44,40 @@ def measure(self, test_case: LLMTestCase): ): if self.run_async: loop = get_or_create_event_loop() - self.opinions: List[str] = loop.run_until_complete( - self._a_generate_opinions(test_case.actual_output) - ) - self.verdicts: List[BiasVerdict] = loop.run_until_complete( - self._a_generate_verdicts() - ) - self.score = self._generate_score() - self.reason = loop.run_until_complete(self._a_generate_reason()) - + loop.run_until_complete(self._a_execute_measure(test_case)) else: self.opinions: List[str] = self._generate_opinions( test_case.actual_output ) - self.verdicts: List[BiasVerdict] = self._generate_verdicts() - self.score = self._generate_score() + self.score = self._calculate_score() self.reason = self._generate_reason() self.success = self.score <= self.threshold capture_metric_type(self.__name__) return self.score + async def a_measure(self, test_case: LLMTestCase): + self.opinions: List[str] = await self._a_generate_opinions( + test_case.actual_output + ) + self.verdicts: List[BiasVerdict] = await self._a_generate_verdicts() + self.score = self._calculate_score() + self.reason = await self._a_generate_reason() + + def _calculate_score(self) -> float: + number_of_verdicts = len(self.verdicts) + if number_of_verdicts == 0: + return 0 + + bias_count = 0 + for verdict in self.verdicts: + if verdict.verdict.strip().lower() == "yes": + bias_count += 1 + + score = bias_count / number_of_verdicts + return 1 if self.strict_mode and score > self.threshold else score + async def _a_generate_reason(self) -> str: if self.include_reason is False: return None @@ -79,60 +91,52 @@ async def _a_generate_reason(self) -> str: biases=biases, score=format(self.score, ".2f"), ) - - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) + res = await self.model.a_generate(prompt) return res def _generate_reason(self) -> str: - loop = get_or_create_event_loop() - return loop.run_until_complete(self._a_generate_reason()) - - def _generate_score(self) -> float: - number_of_verdicts = len(self.verdicts) - if number_of_verdicts == 0: - return 0 + if self.include_reason is False: + return None - bias_count = 0 + biases = [] for verdict in self.verdicts: if verdict.verdict.strip().lower() == "yes": - bias_count += 1 + biases.append(verdict.reason) - score = bias_count / number_of_verdicts - return 1 if self.strict_mode and score > self.threshold else score + prompt: dict = BiasTemplate.generate_reason( + biases=biases, + score=format(self.score, ".2f"), + ) + res = self.model.generate(prompt) + return res async def _a_generate_verdicts(self) -> List[BiasVerdict]: verdicts: List[BiasVerdict] = [] - prompt = BiasTemplate.generate_verdicts(opinions=self.opinions) - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model(prompt) - + res = await self.model.a_generate(prompt) data = trimAndLoadJson(res) verdicts = [BiasVerdict(**item) for item in data["verdicts"]] return verdicts def _generate_verdicts(self) -> List[BiasVerdict]: - loop = get_or_create_event_loop() - loop.run_until_complete(self._a_generate_verdicts()) + verdicts: List[BiasVerdict] = [] + prompt = 
BiasTemplate.generate_verdicts(opinions=self.opinions) + res = self.model.generate(prompt) + data = trimAndLoadJson(res) + verdicts = [BiasVerdict(**item) for item in data["verdicts"]] + return verdicts async def _a_generate_opinions(self, actual_output: str) -> List[str]: prompt = BiasTemplate.generate_opinions(actual_output=actual_output) - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) - + res = await self.model.a_generate(prompt) data = trimAndLoadJson(res) return data["opinions"] def _generate_opinions(self, actual_output: str) -> List[str]: - loop = get_or_create_event_loop() - return loop.run_until_complete(self._a_generate_opinions(actual_output)) + prompt = BiasTemplate.generate_opinions(actual_output=actual_output) + res = self.model.generate(prompt) + data = trimAndLoadJson(res) + return data["opinions"] def is_successful(self) -> bool: return self.success diff --git a/deepeval/metrics/contextual_precision/contextual_precision.py b/deepeval/metrics/contextual_precision/contextual_precision.py index 0ff74d698..0c2e507fa 100644 --- a/deepeval/metrics/contextual_precision/contextual_precision.py +++ b/deepeval/metrics/contextual_precision/contextual_precision.py @@ -53,19 +53,7 @@ def measure(self, test_case: LLMTestCase) -> float: if self.run_async: loop = get_or_create_event_loop() - self.verdicts: List[ContextualPrecisionVerdict] = ( - loop.run_until_complete( - self._a_generate_verdicts( - test_case.input, - test_case.expected_output, - test_case.retrieval_context, - ) - ) - ) - self.score = self._generate_score() - self.reason = loop.run_until_complete( - self._a_generate_reason(test_case.input) - ) + loop.run_until_complete(self.a_measure(test_case)) else: self.verdicts: List[ContextualPrecisionVerdict] = ( self._generate_verdicts( @@ -74,13 +62,25 @@ def measure(self, test_case: LLMTestCase) -> float: test_case.retrieval_context, ) ) - self.score = self._generate_score() + self.score = self._calculate_score() self.reason = self._generate_reason(test_case.input) self.success = self.score >= self.threshold capture_metric_type(self.__name__) return self.score + async def a_measure(self, test_case: LLMTestCase): + self.verdicts: List[ContextualPrecisionVerdict] = ( + await self._a_generate_verdicts( + test_case.input, + test_case.expected_output, + test_case.retrieval_context, + ) + ) + + self.score = self._calculate_score() + self.reason = await self._a_generate_reason(test_case.input) + async def _a_generate_reason(self, input: str): if self.include_reason is False: return None @@ -89,29 +89,33 @@ async def _a_generate_reason(self, input: str): {"verdict": verdict.verdict, "reasons": verdict.reason} for verdict in self.verdicts ] - prompt = ContextualPrecisionTemplate.generate_reason( input=input, - # Need to pass in entire verdict because the reason has to take into account - # not just the relevant chunks, but the bad chunks. 
- # for example, i can still have a perfect score with [1 1 0 0], - # which then GPT will need the entire context to justify why the score is so high verdicts=retrieval_contexts_verdicts, score=format(self.score, ".2f"), ) - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) - + res = await self.model.a_generate(prompt) return res def _generate_reason(self, input: str): - loop = get_or_create_event_loop() - loop.run_until_complete(self._a_generate_reason(input)) + if self.include_reason is False: + return None - def _generate_score(self): + retrieval_contexts_verdicts = [ + {"verdict": verdict.verdict, "reasons": verdict.reason} + for verdict in self.verdicts + ] + prompt = ContextualPrecisionTemplate.generate_reason( + input=input, + verdicts=retrieval_contexts_verdicts, + score=format(self.score, ".2f"), + ) + + res = self.model.generate(prompt) + return res + + def _calculate_score(self): number_of_verdicts = len(self.verdicts) if number_of_verdicts == 0: return 0 @@ -149,12 +153,7 @@ async def _a_generate_verdicts( expected_output=expected_output, retrieval_context=retrieval_context, ) - - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) - + res = await self.model.a_generate(prompt) data = trimAndLoadJson(res) verdicts = [ ContextualPrecisionVerdict(**item) for item in data["verdicts"] @@ -164,10 +163,17 @@ async def _a_generate_verdicts( def _generate_verdicts( self, input: str, expected_output: str, retrieval_context: List[str] ) -> List[ContextualPrecisionVerdict]: - loop = get_or_create_event_loop() - return loop.run_until_complete( - self._a_generate_verdicts(input, expected_output, retrieval_context) + prompt = ContextualPrecisionTemplate.generate_verdicts( + input=input, + expected_output=expected_output, + retrieval_context=retrieval_context, ) + res = self.model.generate(prompt) + data = trimAndLoadJson(res) + verdicts = [ + ContextualPrecisionVerdict(**item) for item in data["verdicts"] + ] + return verdicts def is_successful(self) -> bool: self.success = self.score >= self.threshold diff --git a/deepeval/metrics/contextual_recall/contextual_recall.py b/deepeval/metrics/contextual_recall/contextual_recall.py index 440987241..19c87e7ed 100644 --- a/deepeval/metrics/contextual_recall/contextual_recall.py +++ b/deepeval/metrics/contextual_recall/contextual_recall.py @@ -49,31 +49,29 @@ def measure(self, test_case: LLMTestCase) -> float: ): if self.run_async: loop = get_or_create_event_loop() - self.verdicts: List[ContextualRecallVerdict] = ( - loop.run_until_complete( - self._a_generate_verdicts( - test_case.expected_output, - test_case.retrieval_context, - ) - ) - ) - self.score = self._generate_score() - self.reason = loop.run_until_complete( - self._a_generate_reason(test_case.expected_output) - ) + loop.run_until_complete(self.a_measure(test_case)) else: self.verdicts: List[ContextualRecallVerdict] = ( self._generate_verdicts( test_case.expected_output, test_case.retrieval_context ) ) - self.score = self._generate_score() + self.score = self._calculate_score() self.reason = self._generate_reason(test_case.expected_output) self.success = self.score >= self.threshold capture_metric_type(self.__name__) return self.score + async def a_measure(self, test_case: LLMTestCase): + self.verdicts: List[ContextualRecallVerdict] = ( + await self._a_generate_verdicts( + test_case.expected_output, test_case.retrieval_context + ) + ) + self.score = self._calculate_score() + 
self.reason = await self._a_generate_reason(test_case.expected_output) + async def _a_generate_reason(self, expected_output: str): if self.include_reason is False: return None @@ -93,18 +91,32 @@ async def _a_generate_reason(self, expected_output: str): score=format(self.score, ".2f"), ) - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) - + res = await self.model.a_generate(prompt) return res def _generate_reason(self, expected_output: str): - loop = get_or_create_event_loop() - return loop.run_until_complete(self._a_generate_reason(expected_output)) + if self.include_reason is False: + return None + + supportive_reasons = [] + unsupportive_reasons = [] + for verdict in self.verdicts: + if verdict.verdict.lower() == "yes": + supportive_reasons.append(verdict.reason) + else: + unsupportive_reasons.append(verdict.reason) - def _generate_score(self): + prompt = ContextualRecallTemplate.generate_reason( + expected_output=expected_output, + supportive_reasons=supportive_reasons, + unsupportive_reasons=unsupportive_reasons, + score=format(self.score, ".2f"), + ) + + res = self.model.generate(prompt) + return res + + def _calculate_score(self): number_of_verdicts = len(self.verdicts) if number_of_verdicts == 0: return 0 @@ -115,7 +127,6 @@ def _generate_score(self): justified_sentences += 1 score = justified_sentences / number_of_verdicts - return 0 if self.strict_mode and score < self.threshold else score async def _a_generate_verdicts( @@ -124,11 +135,7 @@ async def _a_generate_verdicts( prompt = ContextualRecallTemplate.generate_verdicts( expected_output=expected_output, retrieval_context=retrieval_context ) - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) - + res = await self.model.a_generate(prompt) data = trimAndLoadJson(res) verdicts = [ ContextualRecallVerdict(**item) for item in data["verdicts"] @@ -138,10 +145,15 @@ async def _a_generate_verdicts( def _generate_verdicts( self, expected_output: str, retrieval_context: List[str] ) -> List[ContextualRecallVerdict]: - loop = get_or_create_event_loop() - return loop.run_until_complete( - self._a_generate_verdicts(expected_output, retrieval_context) + prompt = ContextualRecallTemplate.generate_verdicts( + expected_output=expected_output, retrieval_context=retrieval_context ) + res = self.model.generate(prompt) + data = trimAndLoadJson(res) + verdicts = [ + ContextualRecallVerdict(**item) for item in data["verdicts"] + ] + return verdicts def is_successful(self) -> bool: self.success = self.score >= self.threshold diff --git a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py index 58da7aede..58fa3a486 100644 --- a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +++ b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py @@ -55,7 +55,7 @@ def measure(self, test_case: LLMTestCase) -> float: test_case.input, test_case.retrieval_context ) ) - contextual_recall_score = self._generate_score() + contextual_recall_score = self._calculate_score() self.reason = self._generate_reason( test_case.input, contextual_recall_score @@ -86,7 +86,7 @@ def _generate_reason(self, input: str, score: float): res = self.model(prompt) return res - def _generate_score(self): + def _calculate_score(self): irrelevant_sentences = 0 total_sentence_count = 0 for verdicts in self.verdicts_list: diff --git a/deepeval/metrics/faithfulness/faithfulness.py 
b/deepeval/metrics/faithfulness/faithfulness.py index 1fcc94117..3f10773b5 100644 --- a/deepeval/metrics/faithfulness/faithfulness.py +++ b/deepeval/metrics/faithfulness/faithfulness.py @@ -44,33 +44,51 @@ def measure(self, test_case: LLMTestCase): raise ValueError( "Input, actual output, and retrieval context cannot be None" ) + with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode ): if self.run_async: loop = get_or_create_event_loop() - self.truths, self.claims = loop.run_until_complete( - asyncio.gather( - self._a_generate_truths(test_case.retrieval_context), - self._a_generate_claims(test_case.actual_output), - ) - ) - self.verdicts = loop.run_until_complete( - self._a_generate_verdicts() - ) - self.score = self._generate_score() - self.reason = loop.run_until_complete(self._a_generate_reason()) + loop.run_until_complete(self.a_measure(test_case)) else: self.truths = self._generate_truths(test_case.retrieval_context) self.claims = self._generate_claims(test_case.actual_output) self.verdicts = self._generate_verdicts() - self.score = self._generate_score() + self.score = self._calculate_score() self.reason = self._generate_reason() self.success = self.score >= self.threshold capture_metric_type(self.__name__) return self.score + def _calculate_score(self) -> float: + number_of_verdicts = len(self.verdicts) + if number_of_verdicts == 0: + return 0 + + faithfulness_count = 0 + for verdict in self.verdicts: + if verdict.verdict.strip().lower() != "no": + faithfulness_count += 1 + + score = faithfulness_count / number_of_verdicts + return 0 if self.strict_mode and score < self.threshold else score + + ################################ + ###### Asynchronous logic ###### + ################################ + async def a_measure(self, test_case: LLMTestCase): + self.truths, self.claims = await asyncio.gather( + self._a_generate_truths(test_case.retrieval_context), + self._a_generate_claims(test_case.actual_output), + ) + print("generate verdicts") + self.verdicts = await self._a_generate_verdicts() + self.score = self._calculate_score() + print("generate reasons") + self.reason = await self._a_generate_reason() + async def _a_generate_reason(self) -> str: if self.include_reason is False: return None @@ -84,82 +102,80 @@ async def _a_generate_reason(self) -> str: contradictions=contradictions, score=format(self.score, ".2f"), ) - - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) + res = await self.model.a_generate(prompt) return res - def _generate_reason(self) -> str: - loop = asyncio.get_event_loop() - return loop.run_until_complete(self._a_generate_reason()) - - def _generate_score(self) -> float: - number_of_verdicts = len(self.verdicts) - if number_of_verdicts == 0: - return 0 - - faithfulness_count = 0 - for verdict in self.verdicts: - if verdict.verdict.strip().lower() != "no": - faithfulness_count += 1 - - score = faithfulness_count / number_of_verdicts - - return 0 if self.strict_mode and score < self.threshold else score - async def _a_generate_verdicts(self) -> List[FaithfulnessVerdict]: verdicts: List[FaithfulnessVerdict] = [] prompt = FaithfulnessTemplate.generate_verdicts( claims=self.claims, retrieval_context="\n\n".join(self.truths) ) - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) + res = await self.model.a_generate(prompt) data = trimAndLoadJson(res) verdicts = [FaithfulnessVerdict(**item) for item in data["verdicts"]] - return 
verdicts - def _generate_verdicts(self) -> List[FaithfulnessVerdict]: - loop = asyncio.get_event_loop() - return loop.run_until_complete(self._a_generate_verdicts()) - async def _a_generate_truths(self, retrieval_context: str) -> List[str]: print("generating truths") prompt = FaithfulnessTemplate.generate_claims( text="\n\n".join(retrieval_context) ) - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) + res = await self.model.a_generate(prompt) data = trimAndLoadJson(res) - return data["claims"] - def _generate_truths(self, retrieval_context: str) -> List[str]: - loop = asyncio.get_event_loop() - return loop.run_until_complete( - self._a_generate_truths(retrieval_context) - ) - async def _a_generate_claims(self, actual_output: str) -> List[str]: print("generating claims") prompt = FaithfulnessTemplate.generate_claims(text=actual_output) - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) + res = await self.model.a_generate(prompt) + data = trimAndLoadJson(res) + return data["claims"] + + ############################### + ###### Synchronous logic ###### + ############################### + def _generate_reason(self) -> str: + if self.include_reason is False: + return None + + contradictions = [] + for verdict in self.verdicts: + if verdict.verdict.strip().lower() == "no": + contradictions.append(verdict.reason) + + prompt: dict = FaithfulnessTemplate.generate_reason( + contradictions=contradictions, + score=format(self.score, ".2f"), + ) + res = self.model.generate(prompt) + return res + + def _generate_verdicts(self) -> List[FaithfulnessVerdict]: + verdicts: List[FaithfulnessVerdict] = [] + + prompt = FaithfulnessTemplate.generate_verdicts( + claims=self.claims, retrieval_context="\n\n".join(self.truths) + ) + res = self.model.generate(prompt) + data = trimAndLoadJson(res) + verdicts = [FaithfulnessVerdict(**item) for item in data["verdicts"]] + return verdicts + + def _generate_truths(self, retrieval_context: str) -> List[str]: + print("generating truths") + prompt = FaithfulnessTemplate.generate_claims( + text="\n\n".join(retrieval_context) + ) + res = self.model.generate(prompt) data = trimAndLoadJson(res) return data["claims"] def _generate_claims(self, actual_output: str) -> List[str]: - loop = asyncio.get_event_loop() - return loop.run_until_complete(self._a_generate_claims(actual_output)) + prompt = FaithfulnessTemplate.generate_claims(text=actual_output) + res = self.model.generate(prompt) + data = trimAndLoadJson(res) + return data["claims"] def is_successful(self) -> bool: self.success = self.score >= self.threshold diff --git a/deepeval/metrics/hallucination/hallucination.py b/deepeval/metrics/hallucination/hallucination.py index f7cfab79a..2344813f3 100644 --- a/deepeval/metrics/hallucination/hallucination.py +++ b/deepeval/metrics/hallucination/hallucination.py @@ -49,29 +49,29 @@ def measure(self, test_case: LLMTestCase): ): if self.run_async: loop = get_or_create_event_loop() - self.verdicts: List[HallucinationVerdict] = ( - loop.run_until_complete( - self._a_generate_verdicts( - test_case.actual_output, test_case.context - ) - ) - ) - self.score = self._generate_score() - self.reason = loop.run_until_complete(self._a_generate_reason()) - + loop.run_until_complete(self.a_measure(test_case)) else: self.verdicts: List[HallucinationVerdict] = ( self._generate_verdicts( test_case.actual_output, test_case.context ) ) - self.score = self._generate_score() + self.score = 
self._calculate_score() self.reason = self._generate_reason() self.success = self.score <= self.threshold capture_metric_type(self.__name__) return self.score + async def a_measure(self, test_case: LLMTestCase): + self.verdicts: List[HallucinationVerdict] = ( + await self._a_generate_verdicts( + test_case.actual_output, test_case.context + ) + ) + self.score = self._calculate_score() + self.reason = await self._a_generate_reason() + async def _a_generate_reason(self): if self.include_reason is False: return None @@ -90,18 +90,31 @@ async def _a_generate_reason(self): score=format(self.score, ".2f"), ) - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) - + res = await self.model.a_generate(prompt) return res def _generate_reason(self): - loop = get_or_create_event_loop() - return loop.run_until_complete(self._a_generate_reason()) + if self.include_reason is False: + return None + + factual_alignments = [] + contradictions = [] + for verdict in self.verdicts: + if verdict.verdict.strip().lower() == "no": + factual_alignments.append(verdict.reason) + else: + contradictions.append(verdict.reason) + + prompt: dict = HallucinationTemplate.generate_reason( + factual_alignments=factual_alignments, + contradictions=contradictions, + score=format(self.score, ".2f"), + ) + + res = self.model.generate(prompt) + return res - def _generate_score(self) -> float: + def _calculate_score(self) -> float: number_of_verdicts = len(self.verdicts) if number_of_verdicts == 0: return 0 @@ -120,66 +133,25 @@ async def _a_generate_verdicts( self, actual_output: str, contexts: List[str] ) -> List[HallucinationVerdict]: verdicts: List[HallucinationVerdict] = [] - - if self.run_async: - tasks = [ - self._a_generate_verdict(actual_output, context) - for context in contexts - ] - results = await asyncio.gather(*tasks) - verdicts.extend(results) - else: - prompt = HallucinationTemplate.generate_verdicts( - actual_output=actual_output, contexts=contexts - ) - res = self.model.generate(prompt) - data = trimAndLoadJson(res) - verdicts = [ - HallucinationVerdict(**item) for item in data["verdicts"] - ] - + prompt = HallucinationTemplate.generate_verdicts( + actual_output=actual_output, contexts=contexts + ) + res = await self.model.a_generate(prompt) + data = trimAndLoadJson(res) + verdicts = [HallucinationVerdict(**item) for item in data["verdicts"]] return verdicts def _generate_verdicts( self, actual_output: str, contexts: List[str] ) -> List[HallucinationVerdict]: - loop = get_or_create_event_loop() - return loop.run_until_complete( - self._a_generate_verdicts(actual_output, contexts) - ) - - async def _a_generate_verdict( - self, actual_output: str, context: str - ) -> HallucinationVerdict: - print("generating verdict") - ####################################### - ### Generate verdicts for [context] ### - ####################################### + verdicts: List[HallucinationVerdict] = [] prompt = HallucinationTemplate.generate_verdicts( - actual_output=actual_output, contexts=[context] + actual_output=actual_output, contexts=contexts ) - - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) - + res = self.model.generate(prompt) data = trimAndLoadJson(res) - # verdicts length will always be 1 - final_verdicts = [ - HallucinationVerdict(**item) for item in data["verdicts"] - ] - - return final_verdicts[0] - - def _generate_verdict( - self, actual_output: str, context: str - ) -> HallucinationVerdict: - 
print("generating verdict") - loop = get_or_create_event_loop() - return loop.run_until_complete( - self._a_generate_verdict(actual_output, context) - ) + verdicts = [HallucinationVerdict(**item) for item in data["verdicts"]] + return verdicts def is_successful(self) -> bool: self.success = self.score <= self.threshold diff --git a/deepeval/metrics/knowledge_retention/knowledge_retention.py b/deepeval/metrics/knowledge_retention/knowledge_retention.py index 5de0aa2f4..5ea6ddc3d 100644 --- a/deepeval/metrics/knowledge_retention/knowledge_retention.py +++ b/deepeval/metrics/knowledge_retention/knowledge_retention.py @@ -53,7 +53,7 @@ def measure(self, test_case: ConversationalTestCase): self._generate_verdicts(test_case) ) - knowledge_retention_score = self._generate_score() + knowledge_retention_score = self._calculate_score() self.reason = self._generate_reason(knowledge_retention_score) self.success = knowledge_retention_score >= self.threshold @@ -78,7 +78,7 @@ def _generate_reason(self, score: float) -> str: res = self.model(prompt) return res - def _generate_score(self) -> float: + def _calculate_score(self) -> float: number_of_verdicts = len(self.verdicts) if number_of_verdicts == 0: return 0 diff --git a/deepeval/metrics/summarization/summarization.py b/deepeval/metrics/summarization/summarization.py index 9045a600a..628e413fc 100644 --- a/deepeval/metrics/summarization/summarization.py +++ b/deepeval/metrics/summarization/summarization.py @@ -68,30 +68,7 @@ def measure(self, test_case: LLMTestCase): ): if self.run_async: loop = get_or_create_event_loop() - self.truths, self.claims = loop.run_until_complete( - asyncio.gather( - self._a_generate_claims(test_case.input), - self._a_generate_claims(test_case.actual_output), - ) - ) - self.coverage_verdicts, self.alignment_verdicts = ( - loop.run_until_complete( - asyncio.gather( - self._a_generate_coverage_verdicts(test_case), - self._a_generate_alignment_verdicts(), - ) - ) - ) - alignment_score = self._generate_score(ScoreType.ALIGNMENT) - coverage_score = self._generate_score(ScoreType.COVERAGE) - - self.score_breakdown = { - ScoreType.ALIGNMENT.value: alignment_score, - ScoreType.COVERAGE.value: coverage_score, - } - self.score = min(alignment_score, coverage_score) - self.reason = loop.run_until_complete(self._a_generate_reason()) - + loop.run_until_complete(self.a_measure(test_case)) else: self.truths: List[str] = self._generate_claims(test_case.input) self.claims: List[str] = self._generate_claims( @@ -103,9 +80,8 @@ def measure(self, test_case: LLMTestCase): self.alignment_verdicts: List[SummarizationAlignmentVerdict] = ( self._generate_alignment_verdicts() ) - alignment_score = self._generate_score(ScoreType.ALIGNMENT) - coverage_score = self._generate_score(ScoreType.COVERAGE) - + alignment_score = self._calculate_score(ScoreType.ALIGNMENT) + coverage_score = self._calculate_score(ScoreType.COVERAGE) self.score_breakdown = { ScoreType.ALIGNMENT.value: alignment_score, ScoreType.COVERAGE.value: coverage_score, @@ -117,6 +93,24 @@ def measure(self, test_case: LLMTestCase): capture_metric_type(self.__name__) return self.score + async def a_measure(self, test_case: LLMTestCase): + self.truths, self.claims = await asyncio.gather( + self._a_generate_claims(test_case.input), + self._a_generate_claims(test_case.actual_output), + ) + self.coverage_verdicts, self.alignment_verdicts = await asyncio.gather( + self._a_generate_coverage_verdicts(test_case), + self._a_generate_alignment_verdicts(), + ) + alignment_score = 
self._calculate_score(ScoreType.ALIGNMENT) + coverage_score = self._calculate_score(ScoreType.COVERAGE) + self.score_breakdown = { + ScoreType.ALIGNMENT.value: alignment_score, + ScoreType.COVERAGE.value: coverage_score, + } + self.score = min(alignment_score, coverage_score) + self.reason = await self._a_generate_reason() + async def _a_generate_reason(self) -> str: if self.include_reason is False: return None @@ -152,18 +146,47 @@ async def _a_generate_reason(self) -> str: """ prompt += """Reason:""" - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) - + res = await self.model.a_generate(prompt) return res def _generate_reason(self) -> str: - loop = get_or_create_event_loop() - return loop.run_until_complete(self._a_generate_reason) + if self.include_reason is False: + return None + + contradictions = [] + redundancies = [] + for verdict in self.alignment_verdicts: + if verdict.verdict.strip().lower() == "no": + contradictions.append(verdict.reason) + elif verdict.verdict.strip().lower() == "idk": + redundancies.append(verdict.reason) - def _generate_score(self, score_type: ScoreType) -> float: + questions = [] + if self.coverage_verdicts: + for verdict in self.coverage_verdicts: + if ( + verdict.original_verdict.strip().lower() == "yes" + and verdict.summary_verdict.strip().lower() == "no" + ): + questions.append(verdict.question) + + prompt: dict = SummarizationTemplate.generate_reason( + contradictions=contradictions, + redundancies=redundancies, + questions=questions, + score=format(self.score, ".2f"), + ) + + if len(questions) > 0: + prompt += f"""Questions the original text can answer but not the summary: +{questions} + +""" + prompt += """Reason:""" + res = self.model.generate(prompt) + return res + + def _calculate_score(self, score_type: ScoreType) -> float: if score_type == ScoreType.ALIGNMENT: total = len(self.alignment_verdicts) if total == 0: @@ -199,59 +222,45 @@ async def _a_generate_answers(self, text: str) -> List[str]: prompt = SummarizationTemplate.generate_answers( questions=self.assessment_questions, text=text ) - - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) - + res = await self.model.a_generate(prompt) data = trimAndLoadJson(res) return data["answers"] def _generate_answers(self, text: str) -> List[str]: - loop = get_or_create_event_loop() - return loop.run_until_complete(self._a_generate_answers(text)) + prompt = SummarizationTemplate.generate_answers( + questions=self.assessment_questions, text=text + ) + res = self.model.generate(prompt) + data = trimAndLoadJson(res) + return data["answers"] async def _a_generate_assessment_questions(self, text: str): prompt = SummarizationTemplate.generate_questions(text=text, n=self.n) - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) - + res = await self.model.a_generate(prompt) data = trimAndLoadJson(res) return data["questions"] def _generate_assessment_questions(self, text: str): - loop = get_or_create_event_loop() - return loop.run_until_complete( - self._a_generate_assessment_questions(text) - ) + prompt = SummarizationTemplate.generate_questions(text=text, n=self.n) + res = self.model.generate(prompt) + data = trimAndLoadJson(res) + return data["questions"] async def _a_generate_coverage_verdicts( self, test_case: LLMTestCase ) -> List[SummarizationCoverageVerdict]: if self.assessment_questions is None: - if self.run_async: - 
self.assessment_questions = ( - await self._a_generate_assessment_questions(test_case.input) - ) - else: - self.assessment_questions = self._generate_assessment_questions( - test_case.input - ) + self.assessment_questions = ( + await self._a_generate_assessment_questions(test_case.input) + ) - if self.run_async: - tasks = [ - self._a_generate_answers(test_case.input), - self._a_generate_answers(test_case.actual_output), - ] - results = await asyncio.gather(*tasks) - original_answers = results[0] - summary_answers = results[1] - else: - original_answers = self._generate_answers(test_case.input) - summary_answers = self._generate_answers(test_case.actual_output) + tasks = [ + self._a_generate_answers(test_case.input), + self._a_generate_answers(test_case.actual_output), + ] + results = await asyncio.gather(*tasks) + original_answers = results[0] + summary_answers = results[1] if len(original_answers) != len(summary_answers): raise ValueError("Number of verdicts generated does not equal.") @@ -265,16 +274,33 @@ async def _a_generate_coverage_verdicts( question=self.assessment_questions[i], ) ) - return coverage_veridcts def _generate_coverage_verdicts( self, test_case: LLMTestCase ) -> List[SummarizationCoverageVerdict]: - loop = get_or_create_event_loop() - return loop.run_until_complete( - self._a_generate_coverage_verdicts(test_case) - ) + if self.assessment_questions is None: + self.assessment_questions = self._generate_assessment_questions( + test_case.input + ) + + original_answers = self._generate_answers(test_case.input) + summary_answers = self._generate_answers(test_case.actual_output) + + if len(original_answers) != len(summary_answers): + raise ValueError("Number of verdicts generated does not equal.") + + coverage_veridcts: List[SummarizationCoverageVerdict] = [] + for i in range(len(original_answers)): + coverage_veridcts.append( + SummarizationCoverageVerdict( + summary_verdict=summary_answers[i], + original_verdict=original_answers[i], + question=self.assessment_questions[i], + ) + ) + + return coverage_veridcts async def _a_generate_alignment_verdicts( self, @@ -283,12 +309,7 @@ async def _a_generate_alignment_verdicts( prompt = SummarizationTemplate.generate_alignment_verdicts( summary_claims=self.claims, orignal_text="\n\n".join(self.truths) ) - - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) - + res = await self.model.a_generate(prompt) data = trimAndLoadJson(res) verdicts = [ SummarizationAlignmentVerdict(**item) for item in data["verdicts"] @@ -298,24 +319,30 @@ async def _a_generate_alignment_verdicts( def _generate_alignment_verdicts( self, ) -> List[SummarizationAlignmentVerdict]: - loop = get_or_create_event_loop() - return loop.run_until_complete(self._a_generate_alignment_verdicts()) + verdicts: List[SummarizationAlignmentVerdict] = [] + prompt = SummarizationTemplate.generate_alignment_verdicts( + summary_claims=self.claims, orignal_text="\n\n".join(self.truths) + ) + res = self.model.generate(prompt) + data = trimAndLoadJson(res) + verdicts = [ + SummarizationAlignmentVerdict(**item) for item in data["verdicts"] + ] + return verdicts async def _a_generate_claims(self, text: str) -> List[str]: # Borrow faithfulness template prompt = FaithfulnessTemplate.generate_claims(text=text) - - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) - + res = await self.model.a_generate(prompt) data = trimAndLoadJson(res) return data["claims"] def 
_generate_claims(self, text: str) -> List[str]: - loop = get_or_create_event_loop() - return loop.run_until_complete(self._a_generate_claims(text)) + # Borrow faithfulness template + prompt = FaithfulnessTemplate.generate_claims(text=text) + res = self.model.generate(prompt) + data = trimAndLoadJson(res) + return data["claims"] def is_successful(self) -> bool: self.success = self.score >= self.threshold diff --git a/deepeval/metrics/toxicity/toxicity.py b/deepeval/metrics/toxicity/toxicity.py index 1e783ce9f..9d33509eb 100644 --- a/deepeval/metrics/toxicity/toxicity.py +++ b/deepeval/metrics/toxicity/toxicity.py @@ -45,21 +45,13 @@ def measure(self, test_case: LLMTestCase): ): if self.run_async: loop = get_or_create_event_loop() - self.opinions: List[str] = loop.run_until_complete( - self._a_generate_opinions(test_case.actual_output) - ) - self.verdicts: List[ToxicityVerdict] = loop.run_until_complete( - self._a_generate_verdicts() - ) - self.score = self._generate_score() - self.reason = loop.run_until_complete(self._a_generate_reason()) + loop.run_until_complete(self.a_measure(test_case)) else: self.opinions: List[str] = self._generate_opinions( test_case.actual_output ) - self.verdicts: List[ToxicityVerdict] = self._generate_verdicts() - self.score = self._generate_score() + self.score = self._calculate_score() self.reason = self._generate_reason() self.success = self.score <= self.threshold @@ -67,6 +59,14 @@ def measure(self, test_case: LLMTestCase): capture_metric_type(self.__name__) return self.score + async def a_measure(self, test_case: LLMTestCase): + self.opinions: List[str] = await self._a_generate_opinions( + test_case.actual_output + ) + self.verdicts: List[ToxicityVerdict] = await self._a_generate_verdicts() + self.score = self._calculate_score() + self.reason = await self._a_generate_reason() + async def _a_generate_reason(self) -> str: if self.include_reason is False: return None @@ -81,18 +81,27 @@ async def _a_generate_reason(self) -> str: score=format(self.score, ".2f"), ) - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) - + res = await self.model.a_generate(prompt) return res def _generate_reason(self) -> str: - loop = get_or_create_event_loop() - return loop.run_until_complete(self._a_generate_reason()) + if self.include_reason is False: + return None + + toxics = [] + for verdict in self.verdicts: + if verdict.verdict.strip().lower() == "yes": + toxics.append(verdict.reason) + + prompt: dict = ToxicityTemplate.generate_reason( + toxics=toxics, + score=format(self.score, ".2f"), + ) + + res = self.model.generate(prompt) + return res - def _generate_score(self) -> float: + def _calculate_score(self) -> float: total = len(self.verdicts) if total == 0: return 0 @@ -103,40 +112,35 @@ def _generate_score(self) -> float: toxic_count += 1 score = toxic_count / total - return 1 if self.strict_mode and score > self.threshold else score async def _a_generate_verdicts(self) -> List[ToxicityVerdict]: verdicts: List[ToxicityVerdict] = [] - prompt = ToxicityTemplate.generate_verdicts(opinions=self.opinions) - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) + res = await self.model.a_generate(prompt) data = trimAndLoadJson(res) - verdicts = [ToxicityVerdict(**item) for item in data["verdicts"]] - return verdicts def _generate_verdicts(self) -> List[ToxicityVerdict]: - loop = get_or_create_event_loop() - return loop.run_until_complete(self._a_generate_verdicts()) + 
verdicts: List[ToxicityVerdict] = [] + prompt = ToxicityTemplate.generate_verdicts(opinions=self.opinions) + res = self.model.generate(prompt) + data = trimAndLoadJson(res) + verdicts = [ToxicityVerdict(**item) for item in data["verdicts"]] + return verdicts async def _a_generate_opinions(self, actual_output: str) -> List[str]: prompt = BiasTemplate.generate_opinions(actual_output=actual_output) - if self.run_async: - res = await self.model.a_generate(prompt) - else: - res = self.model.generate(prompt) + res = await self.model.a_generate(prompt) data = trimAndLoadJson(res) - return data["opinions"] def _generate_opinions(self, actual_output: str) -> List[str]: - loop = get_or_create_event_loop() - return loop.run_until_complete(self._a_generate_opinions()) + prompt = BiasTemplate.generate_opinions(actual_output=actual_output) + res = self.model.generate(prompt) + data = trimAndLoadJson(res) + return data["opinions"] def is_successful(self) -> bool: return self.success diff --git a/tests/test_answer_relevancy.py b/tests/test_answer_relevancy.py index 8c77bf970..f663c9b0c 100644 --- a/tests/test_answer_relevancy.py +++ b/tests/test_answer_relevancy.py @@ -46,7 +46,7 @@ """ -# @pytest.mark.skip(reason="openai is very expensive") +@pytest.mark.skip(reason="openai is very expensive") def test_answer_relevancy(): metric = AnswerRelevancyMetric(threshold=0.5) test_case = LLMTestCase( diff --git a/tests/test_bias.py b/tests/test_bias.py index ff6165498..71afbbada 100644 --- a/tests/test_bias.py +++ b/tests/test_bias.py @@ -30,7 +30,7 @@ """ -# @pytest.mark.skip(reason="openai is expensive") +@pytest.mark.skip(reason="openai is expensive") def test_bias(): test_case = LLMTestCase( input="What is the primary difference between a comet and an asteroid?", diff --git a/tests/test_contextual_precision.py b/tests/test_contextual_precision.py index 5efed5073..0916f167a 100644 --- a/tests/test_contextual_precision.py +++ b/tests/test_contextual_precision.py @@ -59,7 +59,7 @@ """ -# @pytest.mark.skip(reason="openai is expensive") +@pytest.mark.skip(reason="openai is expensive") def test_contextual_precision(): metric = ContextualPrecisionMetric(threshold=0.5) test_case = LLMTestCase( diff --git a/tests/test_contextual_recall.py b/tests/test_contextual_recall.py index c7a3dbc1a..3a5f7312b 100644 --- a/tests/test_contextual_recall.py +++ b/tests/test_contextual_recall.py @@ -38,7 +38,7 @@ """ -# @pytest.mark.skip(reason="openai is expensive") +@pytest.mark.skip(reason="openai is expensive") def test_contextual_recall(): test_case = LLMTestCase( input="What is the primary difference between a comet and an asteroid?", diff --git a/tests/test_everything.py b/tests/test_everything.py index 766531c14..22a7d5afc 100644 --- a/tests/test_everything.py +++ b/tests/test_everything.py @@ -69,7 +69,7 @@ being composed mostly of rock and metal. 
""" -strict_mode = True +strict_mode = False @pytest.mark.skip(reason="openai is expensive") @@ -78,20 +78,20 @@ def test_everything(): metric2 = FaithfulnessMetric(threshold=0.5, strict_mode=strict_mode) metric3 = ContextualPrecisionMetric(threshold=0.5, strict_mode=strict_mode) metric4 = ContextualRecallMetric(threshold=0.5, strict_mode=strict_mode) - metric5 = ContextualRelevancyMetric(threshold=0.5, strict_mode=strict_mode) + # metric5 = ContextualRelevancyMetric(threshold=0.5, strict_mode=strict_mode) metric6 = BiasMetric(threshold=0.5, strict_mode=strict_mode) metric7 = ToxicityMetric(threshold=0.5, strict_mode=strict_mode) metric8 = HallucinationMetric(threshold=0.5, strict_mode=strict_mode) - metric9 = SummarizationMetric(threshold=0.5, strict_mode=strict_mode) - metric10 = GEval( - name="Coherence", - criteria="Coherence - determine if the actual output is coherent with the input.", - evaluation_params=[ - LLMTestCaseParams.INPUT, - LLMTestCaseParams.ACTUAL_OUTPUT, - ], - strict_mode=True, - ) + # metric9 = SummarizationMetric(threshold=0.5, strict_mode=strict_mode) + # metric10 = GEval( + # name="Coherence", + # criteria="Coherence - determine if the actual output is coherent with the input.", + # evaluation_params=[ + # LLMTestCaseParams.INPUT, + # LLMTestCaseParams.ACTUAL_OUTPUT, + # ], + # strict_mode=True, + # ) test_case = LLMTestCase( input=question, @@ -103,15 +103,15 @@ def test_everything(): assert_test( test_case, [ - # metric1, - # metric2, - # metric3, - # metric4, + metric1, + metric2, + metric3, + metric4, # metric5, - # metric6, - # metric7, - # metric8, + metric6, + metric7, + metric8, # metric9, - metric10, + # metric10, ], ) diff --git a/tests/test_faithfulness.py b/tests/test_faithfulness.py index 81e10c716..3be4aed1c 100644 --- a/tests/test_faithfulness.py +++ b/tests/test_faithfulness.py @@ -1,3 +1,4 @@ +import asyncio import pytest from deepeval.test_case import LLMTestCase from deepeval.metrics import FaithfulnessMetric @@ -38,12 +39,34 @@ """ -# @pytest.mark.skip(reason="openai is expensive") +@pytest.mark.skip(reason="openai is expensive") def test_faithfulness(): test_case = LLMTestCase( input="What is the primary difference between a comet and an asteroid?", actual_output=output, retrieval_context=[one, two, three], ) - metric = FaithfulnessMetric(run_async=True) + metric = FaithfulnessMetric(run_async=False) assert_test(test_case, [metric]) + + +# test_case = LLMTestCase( +# input="What is the primary difference between a comet and an asteroid?", +# actual_output=output, +# retrieval_context=[one, two, three], +# ) + + +# async def example(): +# metric1 = FaithfulnessMetric(run_async=True) +# metric2 = FaithfulnessMetric(run_async=True) +# metric3 = FaithfulnessMetric(run_async=True) +# a = await asyncio.gather( +# metric1._a_execute_measure(test_case), +# metric2._a_execute_measure(test_case), +# metric3._a_execute_measure(test_case), +# ) +# return a + + +# asyncio.run(example()) From 99b1cd1f49ac3715709ca1d7a51a5187e16abcc2 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Fri, 8 Mar 2024 17:06:59 +0800 Subject: [PATCH 31/59] Async metrics --- deepeval/evaluate.py | 118 +++++++++-- .../integrations/hugging_face/callback.py | 4 +- .../answer_relevancy/answer_relevancy.py | 80 +++++--- deepeval/metrics/base_metric.py | 6 + deepeval/metrics/bias/bias.py | 44 ++-- .../contextual_precision.py | 49 +++-- .../contextual_recall/contextual_recall.py | 47 +++-- .../contextual_relevancy.py | 193 ++++++++++-------- .../metrics/contextual_relevancy/template.py | 
37 ++-- deepeval/metrics/faithfulness/faithfulness.py | 83 ++++---- .../metrics/hallucination/hallucination.py | 45 ++-- .../knowledge_retention.py | 4 +- .../metrics/summarization/summarization.py | 65 +++--- deepeval/metrics/toxicity/toxicity.py | 47 +++-- deepeval/progress_context.py | 21 +- tests/test_everything.py | 61 +++++- tests/test_faithfulness.py | 24 +-- tests/test_hallucination.py | 6 +- tests/test_summarization.py | 2 +- tests/test_toxic.py | 2 +- 20 files changed, 598 insertions(+), 340 deletions(-) diff --git a/deepeval/evaluate.py b/deepeval/evaluate.py index 7ea8888f4..238985728 100644 --- a/deepeval/evaluate.py +++ b/deepeval/evaluate.py @@ -1,9 +1,10 @@ +import asyncio import os -from typing import List +from typing import List, Optional import time from dataclasses import dataclass -from deepeval.utils import drop_and_copy +from deepeval.utils import drop_and_copy, get_or_create_event_loop from deepeval.telemetry import capture_evaluation_count from deepeval.progress_context import progress_context from deepeval.metrics import BaseMetric @@ -46,11 +47,78 @@ def create_test_result( raise ValueError("TestCase not supported yet.") -def execute_test( +def create_api_test_case( + test_case: LLMTestCase, + index: Optional[int] = None, +) -> APITestCase: + return APITestCase( + name=os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{index}"), + input=test_case.input, + actualOutput=test_case.actual_output, + expectedOutput=test_case.expected_output, + success=True, + metricsMetadata=[], + runDuration=0, + latency=test_case.latency, + cost=test_case.cost, + context=test_case.context, + retrievalContext=test_case.retrieval_context, + traceStack=get_trace_stack(), + id=test_case.id, + ) + + +def execute_test_cases( + test_cases: List[LLMTestCase], + metrics: List[BaseMetric], + save_to_disk: bool = False, +) -> List[TestResult]: + test_results: List[TestResult] = [] + test_run_manager.save_to_disk = save_to_disk + for index, test_case in enumerate(test_cases): + success = True + api_test_case: APITestCase = create_api_test_case(test_case, index) + test_start_time = time.perf_counter() + for metric in metrics: + + # Long blocking I/O process + metric.measure(test_case) + + metric_metadata = MetricsMetadata( + metric=metric.__name__, + score=metric.score, + threshold=metric.threshold, + reason=metric.reason, + success=metric.is_successful(), + evaluationModel=metric.evaluation_model, + ) + api_test_case.metrics_metadata.append(metric_metadata) + + if metric_metadata.success is False: + success = False + + test_end_time = time.perf_counter() + run_duration = test_end_time - test_start_time + api_test_case.run_duration = run_duration + api_test_case.success = success + + test_run = test_run_manager.get_test_run() + test_run.test_cases.append(api_test_case) + test_run.dataset_alias = test_case.dataset_alias + test_run_manager.save_test_run() + + test_result = create_test_result( + test_case, success, drop_and_copy(metrics, ["model", "embeddings"]) + ) + test_results.append(test_result) + + return test_results + + +async def a_execute_test_cases( test_cases: List[LLMTestCase], metrics: List[BaseMetric], save_to_disk: bool = False, - run_async: bool = True, ) -> List[TestResult]: test_results: List[TestResult] = [] test_run_manager.save_to_disk = save_to_disk @@ -73,11 +141,13 @@ def execute_test( ) test_start_time = time.perf_counter() - for metric in metrics: - # Long blocking I/O process - metric.measure(test_case) + # Run metrics concurrently using asyncio.gather + await asyncio.gather( 
+ *[metric.a_measure(test_case) for metric in metrics] + ) + for metric in metrics: metric_metadata = MetricsMetadata( metric=metric.__name__, score=metric.score, @@ -124,7 +194,7 @@ def run_test( test_run_manager.reset() with progress_context("Executing run_test()..."): - test_result = execute_test([test_case], metrics, False)[0] + test_result = execute_test_cases([test_case], metrics, False)[0] capture_evaluation_count() print_test_result(test_result) print("") @@ -132,7 +202,9 @@ def run_test( return test_result -def assert_test(test_case: LLMTestCase, metrics: List[BaseMetric]): +def assert_test( + test_case: LLMTestCase, metrics: List[BaseMetric], asynchronous: bool = True +): # TODO: refactor for metric in metrics: if not isinstance(metric, BaseMetric): @@ -142,9 +214,17 @@ def assert_test(test_case: LLMTestCase, metrics: List[BaseMetric]): if not isinstance(test_case, LLMTestCase): raise TypeError("'test_case' must be an instance of 'LLMTestCase'.") - test_result = execute_test([test_case], metrics, get_is_running_deepeval())[ - 0 - ] + if asynchronous: + loop = get_or_create_event_loop() + test_result = loop.run_until_complete( + a_execute_test_cases( + [test_case], metrics, get_is_running_deepeval() + ) + )[0] + else: + test_result = execute_test_cases( + [test_case], metrics, get_is_running_deepeval() + )[0] if not test_result.success: failed_metrics = [ metric @@ -160,7 +240,11 @@ def assert_test(test_case: LLMTestCase, metrics: List[BaseMetric]): raise AssertionError(f"Metrics {failed_metrics_str} failed.") -def evaluate(test_cases: List[LLMTestCase], metrics: List[BaseMetric]): +def evaluate( + test_cases: List[LLMTestCase], + metrics: List[BaseMetric], + asynchronous: bool = True, +): # TODO: refactor for metric in metrics: if not isinstance(metric, BaseMetric): @@ -175,7 +259,13 @@ def evaluate(test_cases: List[LLMTestCase], metrics: List[BaseMetric]): test_run_manager.reset() with progress_context("Evaluating testcases..."): - test_results = execute_test(test_cases, metrics, True) + if asynchronous: + loop = get_or_create_event_loop() + test_results = loop.run_until_complete( + a_execute_test_cases(test_cases, metrics, True) + ) + else: + test_results = execute_test_cases(test_cases, metrics, True) capture_evaluation_count() for test_result in test_results: print_test_result(test_result) diff --git a/deepeval/integrations/hugging_face/callback.py b/deepeval/integrations/hugging_face/callback.py index 73a5543bc..355291675 100644 --- a/deepeval/integrations/hugging_face/callback.py +++ b/deepeval/integrations/hugging_face/callback.py @@ -3,7 +3,7 @@ from .rich_manager import RichManager from deepeval.metrics import BaseMetric -from deepeval.evaluate import execute_test +from deepeval.evaluate import execute_test_cases from deepeval.dataset import EvaluationDataset try: @@ -68,7 +68,7 @@ def _calculate_metric_scores(self) -> Dict[str, List[float]]: Returns: Dict[str, List[float]]: Metric scores for each test case. 
""" - test_results = execute_test( + test_results = execute_test_cases( test_cases=self.evaluation_dataset.test_cases, metrics=self.metrics, ) diff --git a/deepeval/metrics/answer_relevancy/answer_relevancy.py b/deepeval/metrics/answer_relevancy/answer_relevancy.py index 31ab8c368..3b5991c91 100644 --- a/deepeval/metrics/answer_relevancy/answer_relevancy.py +++ b/deepeval/metrics/answer_relevancy/answer_relevancy.py @@ -22,7 +22,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - run_async: bool = True, + asynchronous: bool = True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -32,18 +32,23 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.run_async = run_async + self.asynchronous = asynchronous self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: if test_case.input is None or test_case.actual_output is None: raise ValueError("Input or actual output cannot be None") with metrics_progress_context( - self.__name__, self.evaluation_model, self.strict_mode + self.__name__, + self.evaluation_model, + self.strict_mode, + self.asynchronous, ): - if self.run_async: + if self.asynchronous: loop = get_or_create_event_loop() - loop.run_until_complete(self.a_measure(test_case)) + loop.run_until_complete( + self.a_measure(test_case, _show_indicator=False) + ) else: self.truths = self._generate_statements( test_case.retrieval_context @@ -52,36 +57,34 @@ def measure(self, test_case: LLMTestCase) -> float: self.verdicts = self._generate_verdicts() self.score = self._calculate_score() self.reason = self._generate_reason() - - self.success = self.score >= self.threshold - capture_metric_type(self.__name__) - return self.score - - def _calculate_score(self): - number_of_verdicts = len(self.verdicts) - if number_of_verdicts == 0: - return 0 - - relevant_count = 0 - for verdict in self.verdicts: - if verdict.verdict.strip().lower() != "no": - relevant_count += 1 - - score = relevant_count / number_of_verdicts - return 0 if self.strict_mode and score < self.threshold else score + self.success = self.score >= self.threshold + capture_metric_type(self.__name__) + return self.score ################################ ###### Asynchronous logic ###### ################################ - async def a_measure(self, test_case: LLMTestCase): - self.statements: List[str] = await self._a_generate_statements( - test_case.actual_output - ) - self.verdicts: List[AnswerRelvancyVerdict] = ( - await self._a_generate_verdicts(test_case.input) - ) - self.score = self._calculate_score() - self.reason = await self._a_generate_reason(test_case.input) + async def a_measure( + self, test_case: LLMTestCase, _show_indicator: bool = True + ) -> float: + with metrics_progress_context( + self.__name__, + self.evaluation_model, + self.strict_mode, + self.asynchronous, + _show_indicator, + ): + self.statements: List[str] = await self._a_generate_statements( + test_case.actual_output + ) + self.verdicts: List[AnswerRelvancyVerdict] = ( + await self._a_generate_verdicts(test_case.input) + ) + self.score = self._calculate_score() + self.reason = await self._a_generate_reason(test_case.input) + self.success = self.score >= self.threshold + capture_metric_type(self.__name__) + return self.score async def _a_generate_reason(self, input: str) -> str: if self.include_reason is False: @@ -103,7 +106,6 @@ async def 
_a_generate_reason(self, input: str) -> str: async def _a_generate_verdicts( self, input: str ) -> List[AnswerRelvancyVerdict]: - print("generating verdicts") prompt = AnswerRelevancyTemplate.generate_verdicts( input=input, actual_output=self.statements, @@ -117,7 +119,6 @@ async def _a_generate_statements( self, actual_output: str, ) -> List[str]: - print("generating statements") prompt = AnswerRelevancyTemplate.generate_statements( actual_output=actual_output, ) @@ -129,6 +130,19 @@ async def _a_generate_statements( ############################### ###### Synchronous logic ###### ############################### + def _calculate_score(self): + number_of_verdicts = len(self.verdicts) + if number_of_verdicts == 0: + return 0 + + relevant_count = 0 + for verdict in self.verdicts: + if verdict.verdict.strip().lower() != "no": + relevant_count += 1 + + score = relevant_count / number_of_verdicts + return 0 if self.strict_mode and score < self.threshold else score + def _generate_reason(self, input: str) -> str: if self.include_reason is False: return None diff --git a/deepeval/metrics/base_metric.py b/deepeval/metrics/base_metric.py index a05f2f1f0..669b38386 100644 --- a/deepeval/metrics/base_metric.py +++ b/deepeval/metrics/base_metric.py @@ -22,6 +22,12 @@ def threshold(self, value: float): def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: raise NotImplementedError + @abstractmethod + async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: + raise NotImplementedError( + f"Async execution for {self.__class__.__name__} not supported yet. Please turn set 'asynchronous' to 'False'." + ) + @abstractmethod def is_successful(self) -> bool: raise NotImplementedError diff --git a/deepeval/metrics/bias/bias.py b/deepeval/metrics/bias/bias.py index 039bc1423..490dcae27 100644 --- a/deepeval/metrics/bias/bias.py +++ b/deepeval/metrics/bias/bias.py @@ -22,7 +22,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - run_async: bool = True, + asynchronous: bool = True, strict_mode: bool = False, ): self.threshold = 0 if strict_mode else threshold @@ -32,19 +32,24 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.run_async = run_async + self.asynchronous = asynchronous self.strict_mode = strict_mode - def measure(self, test_case: LLMTestCase): + def measure(self, test_case: LLMTestCase) -> float: if test_case.input is None or test_case.actual_output is None: raise ValueError("Input or actual output cannot be None") with metrics_progress_context( - self.__name__, self.evaluation_model, self.strict_mode + self.__name__, + self.evaluation_model, + self.strict_mode, + self.asynchronous, ): - if self.run_async: + if self.asynchronous: loop = get_or_create_event_loop() - loop.run_until_complete(self._a_execute_measure(test_case)) + loop.run_until_complete( + self.a_measure(test_case, _show_indicator=False) + ) else: self.opinions: List[str] = self._generate_opinions( test_case.actual_output @@ -52,19 +57,30 @@ def measure(self, test_case: LLMTestCase): self.verdicts: List[BiasVerdict] = self._generate_verdicts() self.score = self._calculate_score() self.reason = self._generate_reason() + self.success = self.score <= self.threshold + capture_metric_type(self.__name__) + return self.score + async def a_measure( + self, test_case: LLMTestCase, _show_indicator: bool = True + ) -> float: + with 
metrics_progress_context( + self.__name__, + self.evaluation_model, + self.strict_mode, + self.asynchronous, + _show_indicator, + ): + self.opinions: List[str] = await self._a_generate_opinions( + test_case.actual_output + ) + self.verdicts: List[BiasVerdict] = await self._a_generate_verdicts() + self.score = self._calculate_score() + self.reason = await self._a_generate_reason() self.success = self.score <= self.threshold capture_metric_type(self.__name__) return self.score - async def a_measure(self, test_case: LLMTestCase): - self.opinions: List[str] = await self._a_generate_opinions( - test_case.actual_output - ) - self.verdicts: List[BiasVerdict] = await self._a_generate_verdicts() - self.score = self._calculate_score() - self.reason = await self._a_generate_reason() - def _calculate_score(self) -> float: number_of_verdicts = len(self.verdicts) if number_of_verdicts == 0: diff --git a/deepeval/metrics/contextual_precision/contextual_precision.py b/deepeval/metrics/contextual_precision/contextual_precision.py index 0c2e507fa..938ccea8d 100644 --- a/deepeval/metrics/contextual_precision/contextual_precision.py +++ b/deepeval/metrics/contextual_precision/contextual_precision.py @@ -23,7 +23,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - run_async: bool = True, + asynchronous: bool = True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -33,7 +33,7 @@ def __init__( else: self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() - self.run_async = run_async + self.asynchronous = asynchronous self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: @@ -48,12 +48,17 @@ def measure(self, test_case: LLMTestCase) -> float: ) with metrics_progress_context( - self.__name__, self.evaluation_model, self.strict_mode + self.__name__, + self.evaluation_model, + self.strict_mode, + self.asynchronous, ): - if self.run_async: + if self.asynchronous: loop = get_or_create_event_loop() - loop.run_until_complete(self.a_measure(test_case)) + loop.run_until_complete( + self.a_measure(test_case, _show_indicator=False) + ) else: self.verdicts: List[ContextualPrecisionVerdict] = ( self._generate_verdicts( @@ -64,23 +69,33 @@ def measure(self, test_case: LLMTestCase) -> float: ) self.score = self._calculate_score() self.reason = self._generate_reason(test_case.input) + self.success = self.score >= self.threshold + capture_metric_type(self.__name__) + return self.score + async def a_measure( + self, test_case: LLMTestCase, _show_indicator: bool = True + ) -> float: + with metrics_progress_context( + self.__name__, + self.evaluation_model, + self.strict_mode, + self.asynchronous, + _show_indicator, + ): + self.verdicts: List[ContextualPrecisionVerdict] = ( + await self._a_generate_verdicts( + test_case.input, + test_case.expected_output, + test_case.retrieval_context, + ) + ) + self.score = self._calculate_score() + self.reason = await self._a_generate_reason(test_case.input) self.success = self.score >= self.threshold capture_metric_type(self.__name__) return self.score - async def a_measure(self, test_case: LLMTestCase): - self.verdicts: List[ContextualPrecisionVerdict] = ( - await self._a_generate_verdicts( - test_case.input, - test_case.expected_output, - test_case.retrieval_context, - ) - ) - - self.score = self._calculate_score() - self.reason = await self._a_generate_reason(test_case.input) - async def _a_generate_reason(self, input: str): if 
self.include_reason is False: return None diff --git a/deepeval/metrics/contextual_recall/contextual_recall.py b/deepeval/metrics/contextual_recall/contextual_recall.py index 19c87e7ed..324738bb1 100644 --- a/deepeval/metrics/contextual_recall/contextual_recall.py +++ b/deepeval/metrics/contextual_recall/contextual_recall.py @@ -21,7 +21,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - run_async: bool = True, + asynchronous: bool = True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -31,7 +31,7 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.run_async = run_async + self.asynchronous = asynchronous self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: @@ -45,11 +45,16 @@ def measure(self, test_case: LLMTestCase) -> float: "Input, actual output, expected output, or retrieval context cannot be None" ) with metrics_progress_context( - self.__name__, self.evaluation_model, self.strict_mode + self.__name__, + self.evaluation_model, + self.strict_mode, + self.asynchronous, ): - if self.run_async: + if self.asynchronous: loop = get_or_create_event_loop() - loop.run_until_complete(self.a_measure(test_case)) + loop.run_until_complete( + self.a_measure(test_case, _show_indicator=False) + ) else: self.verdicts: List[ContextualRecallVerdict] = ( self._generate_verdicts( @@ -58,20 +63,34 @@ def measure(self, test_case: LLMTestCase) -> float: ) self.score = self._calculate_score() self.reason = self._generate_reason(test_case.expected_output) + self.success = self.score >= self.threshold + capture_metric_type(self.__name__) + return self.score + async def a_measure( + self, test_case: LLMTestCase, _show_indicator: bool = True + ) -> float: + with metrics_progress_context( + self.__name__, + self.evaluation_model, + self.strict_mode, + self.asynchronous, + _show_indicator, + ): + print("a contextual recall") + self.verdicts: List[ContextualRecallVerdict] = ( + await self._a_generate_verdicts( + test_case.expected_output, test_case.retrieval_context + ) + ) + self.score = self._calculate_score() + self.reason = await self._a_generate_reason( + test_case.expected_output + ) self.success = self.score >= self.threshold capture_metric_type(self.__name__) return self.score - async def a_measure(self, test_case: LLMTestCase): - self.verdicts: List[ContextualRecallVerdict] = ( - await self._a_generate_verdicts( - test_case.expected_output, test_case.retrieval_context - ) - ) - self.score = self._calculate_score() - self.reason = await self._a_generate_reason(test_case.expected_output) - async def _a_generate_reason(self, expected_output: str): if self.include_reason is False: return None diff --git a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py index 58fa3a486..e3c9efbde 100644 --- a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +++ b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py @@ -1,9 +1,10 @@ +import asyncio from typing import Optional, List, Union from pydantic import BaseModel, Field from threading import Lock from concurrent.futures import ThreadPoolExecutor, as_completed -from deepeval.utils import trimAndLoadJson +from deepeval.utils import trimAndLoadJson, get_or_create_event_loop from deepeval.test_case import LLMTestCase from deepeval.metrics import 
BaseMetric from deepeval.models import GPTModel, DeepEvalBaseLLM @@ -16,7 +17,7 @@ class ContextualRelevancyVerdict(BaseModel): verdict: str - sentence: str = Field(default=None) + reason: str = Field(default=None) class ContextualRelevancyMetric(BaseMetric): @@ -25,7 +26,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - multithreading: bool = True, + asynchronous: bool = True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -35,7 +36,7 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.multithreading = multithreading + self.asynchronous = asynchronous self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: @@ -48,120 +49,132 @@ def measure(self, test_case: LLMTestCase) -> float: "Input, actual output, or retrieval context cannot be None" ) with metrics_progress_context( - self.__name__, self.evaluation_model, self.strict_mode + self.__name__, + self.evaluation_model, + self.strict_mode, + self.asynchronous, ): - self.verdicts_list: List[List[ContextualRelevancyVerdict]] = ( - self._generate_verdicts_list( + if self.asynchronous: + loop = get_or_create_event_loop() + loop.run_until_complete( + self.a_measure(test_case, _show_indicator=False) + ) + else: + self.verdicts: List[ContextualRelevancyVerdict] = ( + self._generate_verdicts( + test_case.input, test_case.retrieval_context + ) + ) + self.score = self._calculate_score() + self.reason = self._generate_reason(test_case.input) + self.success = self.score >= self.threshold + capture_metric_type(self.__name__) + return self.score + + async def a_measure( + self, test_case: LLMTestCase, _show_indicator: bool = True + ) -> float: + with metrics_progress_context( + self.__name__, + self.evaluation_model, + self.strict_mode, + self.asynchronous, + _show_indicator, + ): + print("a contextual relevancy") + self.verdicts: List[ContextualRelevancyVerdict] = ( + await self._a_generate_verdicts( test_case.input, test_case.retrieval_context ) ) - contextual_recall_score = self._calculate_score() - - self.reason = self._generate_reason( - test_case.input, contextual_recall_score - ) - - self.success = contextual_recall_score >= self.threshold - self.score = contextual_recall_score + self.score = self._calculate_score() + self.reason = await self._a_generate_reason(test_case.input) + self.success = self.score >= self.threshold capture_metric_type(self.__name__) return self.score - def _generate_reason(self, input: str, score: float): + async def _a_generate_reason(self, input: str): if self.include_reason is False: return None - irrelevant_sentences = [] - for index, verdicts in enumerate(self.verdicts_list): - for verdict in verdicts: - if verdict.verdict.strip().lower() == "no": - data = {"Node": index + 1, "Sentence": verdict.sentence} - irrelevant_sentences.append(data) + irrelevancies = [] + for verdict in self.verdicts: + if verdict.verdict.lower() == "no": + irrelevancies.append(verdict.reason) prompt: dict = ContextualRelevancyTemplate.generate_reason( input=input, - irrelevant_sentences=irrelevant_sentences, - score=format(score, ".2f"), + irrelevancies=irrelevancies, + score=format(self.score, ".2f"), ) + res = await self.model.a_generate(prompt) + return res + + def _generate_reason(self, input: str): + if self.include_reason is False: + return None - res = self.model(prompt) + irrelevancies = [] + for verdict 
in self.verdicts: + if verdict.verdict.lower() == "no": + irrelevancies.append(verdict.reason) + + prompt: dict = ContextualRelevancyTemplate.generate_reason( + input=input, + irrelevancies=irrelevancies, + score=format(self.score, ".2f"), + ) + res = self.model.generate(prompt) return res def _calculate_score(self): - irrelevant_sentences = 0 - total_sentence_count = 0 - for verdicts in self.verdicts_list: - for verdict in verdicts: - total_sentence_count += 1 - if verdict.verdict.lower() == "no": - irrelevant_sentences += 1 - - if total_sentence_count == 0: + total_verdicts = len(self.verdicts) + if total_verdicts == 0: return 0 - score = ( - total_sentence_count - irrelevant_sentences - ) / total_sentence_count + relevant_nodes = 0 + for verdict in self.verdicts: + if verdict.verdict.lower() == "yes": + relevant_nodes += 1 + score = relevant_nodes / total_verdicts return 0 if self.strict_mode and score < self.threshold else score - def _generate_verdicts( - self, - text: str, - context: str, - verdicts_list: List[List[ContextualRelevancyVerdict]], - lock: Lock, - ): - prompt = ContextualRelevancyTemplate.generate_verdicts( - text=text, context=context - ) - - res = self.model(prompt) - data = trimAndLoadJson(res) - verdicts = [ - ContextualRelevancyVerdict(**item) for item in data["verdicts"] - ] + async def _a_generate_verdicts( + self, text: str, retrieval_context: List[str] + ) -> ContextualRelevancyVerdict: + tasks = [] + for context in retrieval_context: + prompt = ContextualRelevancyTemplate.generate_verdict( + text=text, context=context + ) + task = asyncio.create_task(self.model.a_generate(prompt)) + tasks.append(task) - with lock: - verdicts_list.append(verdicts) + results = await asyncio.gather(*tasks) - def _generate_verdicts_list( - self, text: str, retrieval_context: List[str] - ) -> List[List[ContextualRelevancyVerdict]]: - verdicts_list: List[List[ContextualRelevancyVerdict]] = [] - - if self.multithreading: - lock = Lock() - - with ThreadPoolExecutor() as executor: - futures = { - executor.submit( - self._generate_verdicts, - text, - context, - verdicts_list, - lock, - ): context - for context in retrieval_context - } - - for future in as_completed(futures): - future.result() + verdicts = [] + for res in results: + data = trimAndLoadJson(res) + verdict = ContextualRelevancyVerdict(**data) + verdicts.append(verdict) - else: - for context in retrieval_context: - prompt = ContextualRelevancyTemplate.generate_verdicts( - text=text, context=context - ) + return verdicts - res = self.model(prompt) - data = trimAndLoadJson(res) - verdicts = [ - ContextualRelevancyVerdict(**item) - for item in data["verdicts"] - ] - verdicts_list.append(verdicts) + def _generate_verdicts( + self, text: str, retrieval_context: List[str] + ) -> List[ContextualRelevancyVerdict]: + verdicts: List[ContextualRelevancyVerdict] = [] + for context in retrieval_context: + prompt = ContextualRelevancyTemplate.generate_verdict( + text=text, context=context + ) + res = self.model.generate(prompt) + data = trimAndLoadJson(res) + verdict = ContextualRelevancyVerdict(**data) + verdicts.append(verdict) - return verdicts_list + return verdicts def is_successful(self) -> bool: self.success = self.score >= self.threshold diff --git a/deepeval/metrics/contextual_relevancy/template.py b/deepeval/metrics/contextual_relevancy/template.py index 4672bdda0..8fe5b6e17 100644 --- a/deepeval/metrics/contextual_relevancy/template.py +++ b/deepeval/metrics/contextual_relevancy/template.py @@ -1,9 +1,8 @@ class 
ContextualRelevancyTemplate: @staticmethod - def generate_reason(input, irrelevant_sentences, score): - return f"""Based on the given input, irrelevant sentences (list of JSON), and the contextual relevancy score (the closer to 1 the better), please generate a CONCISE reason for the score. -Irrelevant Sentences will contain JSONs with two keys: `sentence` and `node`. `sentence` is the actual sentence itself, and `node` is the node number from the `retrieval context` which it was drawn from. Specify that nodes are from retrieval context the first time you mention it. -In your reason, you should use data in the irrelevant sentences to support your point. + def generate_reason(input, irrelevancies, score): + return f"""Based on the given input, reasons for why the retrieval context is irrelevant to the input, and the contextual relevancy score (the closer to 1 the better), please generate a CONCISE reason for the score. +In your reason, you should quote data provided in the reasons for irrelevancy to support your point. Contextual Relevancy Score: {score} @@ -11,8 +10,8 @@ def generate_reason(input, irrelevant_sentences, score): Input: {input} -Irrelevant Sentences: -{irrelevant_sentences} +Reasons for why the retrieval context is irrelevant to the input: +{irrelevancies} Example: The score is because . @@ -26,30 +25,20 @@ def generate_reason(input, irrelevant_sentences, score): """ @staticmethod - def generate_verdicts(text, context): - return f"""Based on the input and context, please generate a list of JSON objects to indicate whether each given sentence in the context relevant to the provided input. The JSON will have 1 mandatory field: 'verdict', and 1 optional field: 'sentence'. -The 'verdict' key should STRICTLY be either 'yes' or 'no', and states whether the sentence is relevant to the text. -Copy the sentence and supply the value to the 'sentence' key ONLY IF verdict is no. + def generate_verdict(text, context): + return f"""Based on the input and context, please generate a JSON object to indicate whether the context is relevant to the provided input. The JSON will have 1 mandatory field: 'verdict', and 1 optional field: 'reason'. +The 'verdict' key should STRICTLY be either 'yes' or 'no', and states whether the context is relevant to the input. +Provide a 'reason' ONLY IF verdict is no. You MUST quote the irrelevant parts of the context to back up your reason. ** -IMPORTANT: Please make sure to only return in JSON format, with the 'verdicts' key as a list of JSON objects. +IMPORTANT: Please make sure to only return in JSON format. Example Context: "Einstein won the Nobel Prize for his discovery of the photoelectric effect. He won the Nobel Prize in 1968. There was a cat." -Example Input: "Einstein won the Nobel Prize in 1969 for his discovery of the photoelectric effect." +Example Input: "When what was some of Einstein's achievements?" Example: {{ - "verdicts": [ - {{ - "verdict": "yes" - }}, - {{ - "verdict": "yes" - }}, - {{ - "verdict": "no", - "sentence": "There was a cat" - }} - ] + "verdict": "no", + "sentence": "Although the context contains information about Einstein winning the Nobel Prize, it irrelevantly includes 'There was a cat' when it has nothing to do with Einstein's achievements." 
}} ** diff --git a/deepeval/metrics/faithfulness/faithfulness.py b/deepeval/metrics/faithfulness/faithfulness.py index 3f10773b5..3e88642ee 100644 --- a/deepeval/metrics/faithfulness/faithfulness.py +++ b/deepeval/metrics/faithfulness/faithfulness.py @@ -22,7 +22,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - run_async: bool = True, + asynchronous: bool = True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -32,10 +32,10 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.run_async = run_async + self.asynchronous = asynchronous self.strict_mode = strict_mode - def measure(self, test_case: LLMTestCase): + def measure(self, test_case: LLMTestCase) -> float: if ( test_case.input is None or test_case.actual_output is None @@ -46,48 +46,49 @@ def measure(self, test_case: LLMTestCase): ) with metrics_progress_context( - self.__name__, self.evaluation_model, self.strict_mode + self.__name__, + self.evaluation_model, + self.strict_mode, + self.asynchronous, ): - if self.run_async: + if self.asynchronous: loop = get_or_create_event_loop() - loop.run_until_complete(self.a_measure(test_case)) + loop.run_until_complete( + self.a_measure(test_case, _show_indicator=False) + ) else: self.truths = self._generate_truths(test_case.retrieval_context) self.claims = self._generate_claims(test_case.actual_output) self.verdicts = self._generate_verdicts() self.score = self._calculate_score() self.reason = self._generate_reason() - - self.success = self.score >= self.threshold - capture_metric_type(self.__name__) - return self.score - - def _calculate_score(self) -> float: - number_of_verdicts = len(self.verdicts) - if number_of_verdicts == 0: - return 0 - - faithfulness_count = 0 - for verdict in self.verdicts: - if verdict.verdict.strip().lower() != "no": - faithfulness_count += 1 - - score = faithfulness_count / number_of_verdicts - return 0 if self.strict_mode and score < self.threshold else score + self.success = self.score >= self.threshold + capture_metric_type(self.__name__) + return self.score ################################ ###### Asynchronous logic ###### ################################ - async def a_measure(self, test_case: LLMTestCase): - self.truths, self.claims = await asyncio.gather( - self._a_generate_truths(test_case.retrieval_context), - self._a_generate_claims(test_case.actual_output), - ) - print("generate verdicts") - self.verdicts = await self._a_generate_verdicts() - self.score = self._calculate_score() - print("generate reasons") - self.reason = await self._a_generate_reason() + async def a_measure( + self, test_case: LLMTestCase, _show_indicator: bool = True + ) -> float: + with metrics_progress_context( + self.__name__, + self.evaluation_model, + self.strict_mode, + self.asynchronous, + _show_indicator, + ): + self.truths, self.claims = await asyncio.gather( + self._a_generate_truths(test_case.retrieval_context), + self._a_generate_claims(test_case.actual_output), + ) + self.verdicts = await self._a_generate_verdicts() + self.score = self._calculate_score() + self.reason = await self._a_generate_reason() + self.success = self.score >= self.threshold + capture_metric_type(self.__name__) + return self.score async def _a_generate_reason(self) -> str: if self.include_reason is False: @@ -117,7 +118,6 @@ async def _a_generate_verdicts(self) -> List[FaithfulnessVerdict]: return 
verdicts async def _a_generate_truths(self, retrieval_context: str) -> List[str]: - print("generating truths") prompt = FaithfulnessTemplate.generate_claims( text="\n\n".join(retrieval_context) ) @@ -126,7 +126,6 @@ async def _a_generate_truths(self, retrieval_context: str) -> List[str]: return data["claims"] async def _a_generate_claims(self, actual_output: str) -> List[str]: - print("generating claims") prompt = FaithfulnessTemplate.generate_claims(text=actual_output) res = await self.model.a_generate(prompt) data = trimAndLoadJson(res) @@ -135,6 +134,19 @@ async def _a_generate_claims(self, actual_output: str) -> List[str]: ############################### ###### Synchronous logic ###### ############################### + def _calculate_score(self) -> float: + number_of_verdicts = len(self.verdicts) + if number_of_verdicts == 0: + return 0 + + faithfulness_count = 0 + for verdict in self.verdicts: + if verdict.verdict.strip().lower() != "no": + faithfulness_count += 1 + + score = faithfulness_count / number_of_verdicts + return 0 if self.strict_mode and score < self.threshold else score + def _generate_reason(self) -> str: if self.include_reason is False: return None @@ -163,7 +175,6 @@ def _generate_verdicts(self) -> List[FaithfulnessVerdict]: return verdicts def _generate_truths(self, retrieval_context: str) -> List[str]: - print("generating truths") prompt = FaithfulnessTemplate.generate_claims( text="\n\n".join(retrieval_context) ) diff --git a/deepeval/metrics/hallucination/hallucination.py b/deepeval/metrics/hallucination/hallucination.py index 2344813f3..4fc37f259 100644 --- a/deepeval/metrics/hallucination/hallucination.py +++ b/deepeval/metrics/hallucination/hallucination.py @@ -24,7 +24,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - run_async: bool = False, + asynchronous: bool = False, strict_mode: bool = False, ): self.threshold = 0 if strict_mode else threshold @@ -34,7 +34,7 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.run_async = run_async + self.asynchronous = asynchronous self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase): @@ -45,11 +45,16 @@ def measure(self, test_case: LLMTestCase): ): raise ValueError("Input, actual output, or context cannot be None") with metrics_progress_context( - self.__name__, self.evaluation_model, self.strict_mode + self.__name__, + self.evaluation_model, + self.strict_mode, + self.asynchronous, ): - if self.run_async: + if self.asynchronous: loop = get_or_create_event_loop() - loop.run_until_complete(self.a_measure(test_case)) + loop.run_until_complete( + self.a_measure(test_case, _show_indicator=False) + ) else: self.verdicts: List[HallucinationVerdict] = ( self._generate_verdicts( @@ -58,20 +63,32 @@ def measure(self, test_case: LLMTestCase): ) self.score = self._calculate_score() self.reason = self._generate_reason() + self.success = self.score <= self.threshold + capture_metric_type(self.__name__) + return self.score + async def a_measure( + self, test_case: LLMTestCase, _show_indicator: bool = True + ) -> float: + with metrics_progress_context( + self.__name__, + self.evaluation_model, + self.strict_mode, + self.asynchronous, + _show_indicator, + ): + print("a hallucination") + self.verdicts: List[HallucinationVerdict] = ( + await self._a_generate_verdicts( + test_case.actual_output, test_case.context + ) + ) + self.score = 
self._calculate_score() + self.reason = await self._a_generate_reason() self.success = self.score <= self.threshold capture_metric_type(self.__name__) return self.score - async def a_measure(self, test_case: LLMTestCase): - self.verdicts: List[HallucinationVerdict] = ( - await self._a_generate_verdicts( - test_case.actual_output, test_case.context - ) - ) - self.score = self._calculate_score() - self.reason = await self._a_generate_reason() - async def _a_generate_reason(self): if self.include_reason is False: return None diff --git a/deepeval/metrics/knowledge_retention/knowledge_retention.py b/deepeval/metrics/knowledge_retention/knowledge_retention.py index 5ea6ddc3d..f6428cf49 100644 --- a/deepeval/metrics/knowledge_retention/knowledge_retention.py +++ b/deepeval/metrics/knowledge_retention/knowledge_retention.py @@ -44,7 +44,9 @@ def measure(self, test_case: ConversationalTestCase): raise ValueError("Messages cannot be empty") with metrics_progress_context( - self.__name__, self.evaluation_model, self.strict_mode + self.__name__, + self.evaluation_model, + self.strict_mode, ): self.knowledges: List[Knowledge] = self._generate_knowledges( test_case diff --git a/deepeval/metrics/summarization/summarization.py b/deepeval/metrics/summarization/summarization.py index 628e413fc..7f02da7f6 100644 --- a/deepeval/metrics/summarization/summarization.py +++ b/deepeval/metrics/summarization/summarization.py @@ -39,7 +39,7 @@ def __init__( model: Optional[Union[str, DeepEvalBaseLLM]] = None, assessment_questions: Optional[List[str]] = None, include_reason: bool = True, - run_async=True, + asynchronous=True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -54,7 +54,7 @@ def __init__( else: self.assessment_questions = assessment_questions - self.run_async = run_async + self.asynchronous = asynchronous self.include_reason = include_reason self.n = n self.strict_mode = strict_mode @@ -64,11 +64,16 @@ def measure(self, test_case: LLMTestCase): raise ValueError("Input or actual output cannot be None") with metrics_progress_context( - self.__name__, self.evaluation_model, self.strict_mode + self.__name__, + self.evaluation_model, + self.strict_mode, + self.asynchronous, ): - if self.run_async: + if self.asynchronous: loop = get_or_create_event_loop() - loop.run_until_complete(self.a_measure(test_case)) + loop.run_until_complete( + self.a_measure(test_case, _show_indicator=False) + ) else: self.truths: List[str] = self._generate_claims(test_case.input) self.claims: List[str] = self._generate_claims( @@ -88,29 +93,43 @@ def measure(self, test_case: LLMTestCase): } self.score = min(alignment_score, coverage_score) self.reason = self._generate_reason() + self.success = self.score >= self.threshold + capture_metric_type(self.__name__) + return self.score + async def a_measure( + self, test_case: LLMTestCase, _show_indicator: bool = True + ) -> float: + with metrics_progress_context( + self.__name__, + self.evaluation_model, + self.strict_mode, + self.asynchronous, + _show_indicator, + ): + print("a summarization") + self.truths, self.claims = await asyncio.gather( + self._a_generate_claims(test_case.input), + self._a_generate_claims(test_case.actual_output), + ) + self.coverage_verdicts, self.alignment_verdicts = ( + await asyncio.gather( + self._a_generate_coverage_verdicts(test_case), + self._a_generate_alignment_verdicts(), + ) + ) + alignment_score = self._calculate_score(ScoreType.ALIGNMENT) + coverage_score = self._calculate_score(ScoreType.COVERAGE) + 
self.score_breakdown = { + ScoreType.ALIGNMENT.value: alignment_score, + ScoreType.COVERAGE.value: coverage_score, + } + self.score = min(alignment_score, coverage_score) + self.reason = await self._a_generate_reason() self.success = self.score >= self.threshold capture_metric_type(self.__name__) return self.score - async def a_measure(self, test_case: LLMTestCase): - self.truths, self.claims = await asyncio.gather( - self._a_generate_claims(test_case.input), - self._a_generate_claims(test_case.actual_output), - ) - self.coverage_verdicts, self.alignment_verdicts = await asyncio.gather( - self._a_generate_coverage_verdicts(test_case), - self._a_generate_alignment_verdicts(), - ) - alignment_score = self._calculate_score(ScoreType.ALIGNMENT) - coverage_score = self._calculate_score(ScoreType.COVERAGE) - self.score_breakdown = { - ScoreType.ALIGNMENT.value: alignment_score, - ScoreType.COVERAGE.value: coverage_score, - } - self.score = min(alignment_score, coverage_score) - self.reason = await self._a_generate_reason() - async def _a_generate_reason(self) -> str: if self.include_reason is False: return None diff --git a/deepeval/metrics/toxicity/toxicity.py b/deepeval/metrics/toxicity/toxicity.py index 9d33509eb..5e6383f30 100644 --- a/deepeval/metrics/toxicity/toxicity.py +++ b/deepeval/metrics/toxicity/toxicity.py @@ -23,7 +23,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - run_async: bool = True, + asynchronous: bool = True, strict_mode: bool = False, ): self.threshold = 0 if strict_mode else threshold @@ -33,7 +33,7 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.run_async = run_async + self.asynchronous = asynchronous self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase): @@ -41,11 +41,16 @@ def measure(self, test_case: LLMTestCase): raise ValueError("Input or actual output cannot be None") with metrics_progress_context( - self.__name__, self.evaluation_model, self.strict_mode + self.__name__, + self.evaluation_model, + self.strict_mode, + self.asynchronous, ): - if self.run_async: + if self.asynchronous: loop = get_or_create_event_loop() - loop.run_until_complete(self.a_measure(test_case)) + loop.run_until_complete( + self.a_measure(test_case, _show_indicator=False) + ) else: self.opinions: List[str] = self._generate_opinions( test_case.actual_output @@ -53,20 +58,34 @@ def measure(self, test_case: LLMTestCase): self.verdicts: List[ToxicityVerdict] = self._generate_verdicts() self.score = self._calculate_score() self.reason = self._generate_reason() - + self.success = self.score <= self.threshold + self.score = self.score + capture_metric_type(self.__name__) + return self.score + + async def a_measure( + self, test_case: LLMTestCase, _show_indicator: bool = True + ) -> float: + with metrics_progress_context( + self.__name__, + self.evaluation_model, + self.strict_mode, + self.asynchronous, + _show_indicator, + ): + self.opinions: List[str] = await self._a_generate_opinions( + test_case.actual_output + ) + self.verdicts: List[ToxicityVerdict] = ( + await self._a_generate_verdicts() + ) + self.score = self._calculate_score() + self.reason = await self._a_generate_reason() self.success = self.score <= self.threshold self.score = self.score capture_metric_type(self.__name__) return self.score - async def a_measure(self, test_case: LLMTestCase): - self.opinions: List[str] = await 
self._a_generate_opinions( - test_case.actual_output - ) - self.verdicts: List[ToxicityVerdict] = await self._a_generate_verdicts() - self.score = self._calculate_score() - self.reason = await self._a_generate_reason() - async def _a_generate_reason(self) -> str: if self.include_reason is False: return None diff --git a/deepeval/progress_context.py b/deepeval/progress_context.py index d9dccecbf..52d59a471 100644 --- a/deepeval/progress_context.py +++ b/deepeval/progress_context.py @@ -24,18 +24,23 @@ def metrics_progress_context( metric_name: str, evaluation_model: str, strict_mode: bool, + asynchronous: bool, + show_indicator: bool = True, total: int = 9999, transient: bool = True, ): - description = f"✨ 🍰 ✨ You're using DeepEval's latest {metric_name} Metric (using {evaluation_model}, strict_mode={strict_mode})! This may take a minute..." + description = f"✨ 🍰 ✨ You're using DeepEval's latest {metric_name} Metric (using {evaluation_model}, strict={strict_mode}, async={asynchronous})! This may take a minute..." console = Console(file=sys.stderr) # Direct output to standard error - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - console=console, # Use the custom console - transient=transient, - ) as progress: - progress.add_task(description=description, total=total) + if show_indicator: + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + console=console, # Use the custom console + transient=transient, + ) as progress: + progress.add_task(description=description, total=total) + yield + else: yield diff --git a/tests/test_everything.py b/tests/test_everything.py index 22a7d5afc..1e3a394b5 100644 --- a/tests/test_everything.py +++ b/tests/test_everything.py @@ -76,12 +76,12 @@ def test_everything(): metric1 = AnswerRelevancyMetric(threshold=0.5, strict_mode=strict_mode) metric2 = FaithfulnessMetric(threshold=0.5, strict_mode=strict_mode) - metric3 = ContextualPrecisionMetric(threshold=0.5, strict_mode=strict_mode) - metric4 = ContextualRecallMetric(threshold=0.5, strict_mode=strict_mode) + # metric3 = ContextualPrecisionMetric(threshold=0.5, strict_mode=strict_mode) + # metric4 = ContextualRecallMetric(threshold=0.5, strict_mode=strict_mode) # metric5 = ContextualRelevancyMetric(threshold=0.5, strict_mode=strict_mode) metric6 = BiasMetric(threshold=0.5, strict_mode=strict_mode) - metric7 = ToxicityMetric(threshold=0.5, strict_mode=strict_mode) - metric8 = HallucinationMetric(threshold=0.5, strict_mode=strict_mode) + # metric7 = ToxicityMetric(threshold=0.5, strict_mode=strict_mode) + # metric8 = HallucinationMetric(threshold=0.5, strict_mode=strict_mode) # metric9 = SummarizationMetric(threshold=0.5, strict_mode=strict_mode) # metric10 = GEval( # name="Coherence", @@ -105,12 +105,57 @@ def test_everything(): [ metric1, metric2, - metric3, - metric4, + # metric3, + # metric4, + # metric5, + # metric6, + # metric7, + # metric8, + # metric9, + # metric10, + ], + ) + + +@pytest.mark.skip(reason="openai is expensive") +def test_everything_2(): + metric1 = AnswerRelevancyMetric(threshold=0.5, strict_mode=strict_mode) + metric2 = FaithfulnessMetric(threshold=0.5, strict_mode=strict_mode) + # metric3 = ContextualPrecisionMetric(threshold=0.5, strict_mode=strict_mode) + # metric4 = ContextualRecallMetric(threshold=0.5, strict_mode=strict_mode) + # metric5 = ContextualRelevancyMetric(threshold=0.5, strict_mode=strict_mode) + metric6 = BiasMetric(threshold=0.5, strict_mode=strict_mode) + # metric7 = 
ToxicityMetric(threshold=0.5, strict_mode=strict_mode) + # metric8 = HallucinationMetric(threshold=0.5, strict_mode=strict_mode) + # metric9 = SummarizationMetric(threshold=0.5, strict_mode=strict_mode) + # metric10 = GEval( + # name="Coherence", + # criteria="Coherence - determine if the actual output is coherent with the input.", + # evaluation_params=[ + # LLMTestCaseParams.INPUT, + # LLMTestCaseParams.ACTUAL_OUTPUT, + # ], + # strict_mode=True, + # ) + + test_case = LLMTestCase( + input=question, + actual_output=answer, + expected_output=answer, + retrieval_context=[one, two, three], + context=[four, five], + ) + assert_test( + test_case, + [ + metric1, + metric2, + # metric3, + # metric4, # metric5, metric6, - metric7, - metric8, + # metric7, + # metric8, # metric9, # metric10, ], diff --git a/tests/test_faithfulness.py b/tests/test_faithfulness.py index 3be4aed1c..8c693a334 100644 --- a/tests/test_faithfulness.py +++ b/tests/test_faithfulness.py @@ -46,27 +46,5 @@ def test_faithfulness(): actual_output=output, retrieval_context=[one, two, three], ) - metric = FaithfulnessMetric(run_async=False) + metric = FaithfulnessMetric(asynchronous=False) assert_test(test_case, [metric]) - - -# test_case = LLMTestCase( -# input="What is the primary difference between a comet and an asteroid?", -# actual_output=output, -# retrieval_context=[one, two, three], -# ) - - -# async def example(): -# metric1 = FaithfulnessMetric(run_async=True) -# metric2 = FaithfulnessMetric(run_async=True) -# metric3 = FaithfulnessMetric(run_async=True) -# a = await asyncio.gather( -# metric1._a_execute_measure(test_case), -# metric2._a_execute_measure(test_case), -# metric3._a_execute_measure(test_case), -# ) -# return a - - -# asyncio.run(example()) diff --git a/tests/test_hallucination.py b/tests/test_hallucination.py index 5b34b2c06..459de1133 100644 --- a/tests/test_hallucination.py +++ b/tests/test_hallucination.py @@ -5,7 +5,7 @@ import deepeval -# @pytest.mark.skip(reason="openai is expensive") +@pytest.mark.skip(reason="openai is expensive") def test_hallucination_metric(): metric = HallucinationMetric(threshold=0.5) test_case = LLMTestCase( @@ -20,7 +20,7 @@ def test_hallucination_metric(): assert_test(test_case, [metric]) -# @pytest.mark.skip(reason="openai is expensive") +@pytest.mark.skip(reason="openai is expensive") def test_hallucination_metric_2(): metric = HallucinationMetric(threshold=0.6) test_case = LLMTestCase( @@ -33,7 +33,7 @@ def test_hallucination_metric_2(): assert_test(test_case, [metric]) -# @pytest.mark.skip(reason="openai is expensive") +@pytest.mark.skip(reason="openai is expensive") def test_hallucination_metric_3(): metric = HallucinationMetric(threshold=0.6) test_case = LLMTestCase( diff --git a/tests/test_summarization.py b/tests/test_summarization.py index 86fce12f8..f330a152a 100644 --- a/tests/test_summarization.py +++ b/tests/test_summarization.py @@ -4,7 +4,7 @@ from deepeval.metrics import SummarizationMetric -# @pytest.mark.skip(reason="openai is expensive") +@pytest.mark.skip(reason="openai is expensive") def test_summarization(): metric = SummarizationMetric() diff --git a/tests/test_toxic.py b/tests/test_toxic.py index eb0fdf848..ac6cb84a4 100644 --- a/tests/test_toxic.py +++ b/tests/test_toxic.py @@ -11,7 +11,7 @@ """ -# @pytest.mark.skip(reason="openai is expensive") +@pytest.mark.skip(reason="openai is expensive") def test_bias(): test_case = LLMTestCase( input="What is the primary difference between a comet and an asteroid?", From 
d26041dab2882fcb176ba687d6c496b167eece4f Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Fri, 8 Mar 2024 17:22:04 +0800 Subject: [PATCH 32/59] ragas async placeholder --- deepeval/metrics/cost.py | 6 ++++++ deepeval/metrics/latency.py | 6 ++++++ deepeval/metrics/ragas.py | 33 +++++++++++++++++++++++++++++++++ tests/test_ragas.py | 2 +- 4 files changed, 46 insertions(+), 1 deletion(-) diff --git a/deepeval/metrics/cost.py b/deepeval/metrics/cost.py index 28d2d63c5..08a635d72 100644 --- a/deepeval/metrics/cost.py +++ b/deepeval/metrics/cost.py @@ -13,6 +13,12 @@ def measure(self, test_case: LLMTestCase): capture_metric_type(self.__name__) return self.score + async def a_measure(self, test_case: LLMTestCase): + self.success = test_case.cost <= self.threshold + self.score = test_case.cost + capture_metric_type(self.__name__) + return self.score + def is_successful(self): return self.success diff --git a/deepeval/metrics/latency.py b/deepeval/metrics/latency.py index e04cfc6db..27d45aef7 100644 --- a/deepeval/metrics/latency.py +++ b/deepeval/metrics/latency.py @@ -13,6 +13,12 @@ def measure(self, test_case: LLMTestCase): capture_metric_type(self.__name__) return self.score + async def a_measure(self, test_case: LLMTestCase): + self.success = test_case.latency <= self.threshold + self.score = test_case.latency + capture_metric_type(self.__name__) + return self.score + def is_successful(self): return self.success diff --git a/deepeval/metrics/ragas.py b/deepeval/metrics/ragas.py index a78f621f6..9809af1b4 100644 --- a/deepeval/metrics/ragas.py +++ b/deepeval/metrics/ragas.py @@ -65,6 +65,9 @@ def measure(self, test_case: LLMTestCase): capture_metric_type(self.__name__) return self.score + async def a_measure(self, test_case: LLMTestCase): + return self.measure(test_case) + def is_successful(self): return self.success @@ -85,6 +88,9 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() + async def a_measure(self, test_case: LLMTestCase): + return self.measure(test_case) + def measure(self, test_case: LLMTestCase): # sends to server try: @@ -144,6 +150,9 @@ def __init__( self.evaluation_model = self.model.get_model_name() self.embeddings = embeddings + async def a_measure(self, test_case: LLMTestCase): + return self.measure(test_case) + def measure(self, test_case: LLMTestCase): # sends to server try: @@ -200,6 +209,9 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() + async def a_measure(self, test_case: LLMTestCase): + return self.measure(test_case) + def measure(self, test_case: LLMTestCase): # sends to server try: @@ -252,6 +264,9 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() + async def a_measure(self, test_case: LLMTestCase): + return self.measure(test_case) + def measure(self, test_case: LLMTestCase): # sends to server try: @@ -305,6 +320,9 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() + async def a_measure(self, test_case: LLMTestCase): + return self.measure(test_case) + def measure(self, test_case: LLMTestCase): # sends to server try: @@ -359,6 +377,9 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() + async def a_measure(self, test_case: LLMTestCase): + return self.measure(test_case) + def measure(self, test_case: LLMTestCase): try: from ragas import evaluate @@ -411,6 +432,9 @@ def __init__( self.model = 
GPTModel(model=model) self.evaluation_model = self.model.get_model_name() + async def a_measure(self, test_case: LLMTestCase): + return self.measure(test_case) + def measure(self, test_case: LLMTestCase): try: from ragas import evaluate @@ -464,6 +488,9 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() + async def a_measure(self, test_case: LLMTestCase): + return self.measure(test_case) + def measure(self, test_case: LLMTestCase): try: from ragas import evaluate @@ -517,6 +544,9 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() + async def a_measure(self, test_case: LLMTestCase): + return self.measure(test_case) + def measure(self, test_case: LLMTestCase): try: from ragas import evaluate @@ -572,6 +602,9 @@ def __init__( self.evaluation_model = self.model.get_model_name() self.embeddings = embeddings + async def a_measure(self, test_case: LLMTestCase): + return self.measure(test_case) + def measure(self, test_case: LLMTestCase): # sends to server try: diff --git a/tests/test_ragas.py b/tests/test_ragas.py index 4f5046ebe..5c1b9ddf8 100644 --- a/tests/test_ragas.py +++ b/tests/test_ragas.py @@ -75,6 +75,6 @@ def test_everything(): # metric7, metric8, metric9, - # metric10, + metric10, ], ) From 720667cf38a9ac2bfd85edefd455b5211fedd2a4 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Fri, 8 Mar 2024 18:35:31 +0800 Subject: [PATCH 33/59] added async indicator --- deepeval/evaluate.py | 28 +---- .../answer_relevancy/answer_relevancy.py | 111 ++++++++++-------- deepeval/metrics/base_metric.py | 2 + deepeval/metrics/bias/bias.py | 46 +++++--- .../contextual_precision.py | 72 ++++++------ .../contextual_recall/contextual_recall.py | 13 +- .../contextual_relevancy.py | 13 +- deepeval/metrics/cost.py | 6 +- deepeval/metrics/faithfulness/faithfulness.py | 104 ++++++++-------- .../metrics/hallucination/hallucination.py | 41 ++++--- deepeval/metrics/latency.py | 6 +- deepeval/metrics/ragas.py | 46 ++++++-- .../metrics/summarization/summarization.py | 19 ++- deepeval/metrics/toxicity/toxicity.py | 47 +++++--- 14 files changed, 310 insertions(+), 244 deletions(-) diff --git a/deepeval/evaluate.py b/deepeval/evaluate.py index 238985728..55a09c968 100644 --- a/deepeval/evaluate.py +++ b/deepeval/evaluate.py @@ -79,11 +79,9 @@ def execute_test_cases( success = True api_test_case: APITestCase = create_api_test_case(test_case, index) test_start_time = time.perf_counter() - for metric in metrics: - - # Long blocking I/O process - metric.measure(test_case) + for metric in metrics: + metric.measure(test_case, _asynchronous=False) metric_metadata = MetricsMetadata( metric=metric.__name__, score=metric.score, @@ -93,7 +91,6 @@ def execute_test_cases( evaluationModel=metric.evaluation_model, ) api_test_case.metrics_metadata.append(metric_metadata) - if metric_metadata.success is False: success = False @@ -124,22 +121,7 @@ async def a_execute_test_cases( test_run_manager.save_to_disk = save_to_disk for index, test_case in enumerate(test_cases): success = True - api_test_case: APITestCase = APITestCase( - name=os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{index}"), - input=test_case.input, - actualOutput=test_case.actual_output, - expectedOutput=test_case.expected_output, - success=success, - metricsMetadata=[], - runDuration=0, - latency=test_case.latency, - cost=test_case.cost, - context=test_case.context, - retrievalContext=test_case.retrieval_context, - traceStack=get_trace_stack(), - id=test_case.id, - ) - 
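# --- Editor's illustrative sketch (not part of the patch itself) --------------
# The surrounding hunk swaps the manual APITestCase construction for
# create_api_test_case() and notes that metrics are "run concurrently using
# asyncio.gather". A minimal sketch of how the new
# a_measure(test_case, _show_indicator=False) API could be driven that way is
# shown here; the metric choices, thresholds, and test-case values are
# assumptions for illustration only, not the library's prescribed usage.
import asyncio

from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase


async def run_metrics_concurrently(test_case: LLMTestCase) -> list:
    metrics = [
        AnswerRelevancyMetric(threshold=0.5),
        FaithfulnessMetric(threshold=0.5),
    ]
    # Each a_measure awaits its own LLM calls; gather lets them run
    # concurrently while the per-metric progress indicator stays hidden.
    return await asyncio.gather(
        *(m.a_measure(test_case, _show_indicator=False) for m in metrics)
    )


# Example usage (assumed values):
# test_case = LLMTestCase(
#     input="What is the primary difference between a comet and an asteroid?",
#     actual_output="...",
#     retrieval_context=["..."],
# )
# scores = asyncio.run(run_metrics_concurrently(test_case))
# ------------------------------------------------------------------------------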
+ api_test_case: APITestCase = create_api_test_case(test_case, index) test_start_time = time.perf_counter() # Run metrics concurrently using asyncio.gather @@ -283,11 +265,11 @@ def print_test_result(test_result: TestResult): for metric in test_result.metrics: if not metric.is_successful(): print( - f" - ❌ {metric.__name__} (score: {metric.score}, threshold: {metric.threshold}, evaluation model: {metric.evaluation_model}, reason: {metric.reason})" + f" - ❌ {metric.__name__} (score: {metric.score}, threshold: {metric.threshold}, strict: {metric.strict_mode}, evaluation model: {metric.evaluation_model}, reason: {metric.reason})" ) else: print( - f" - ✅ {metric.__name__} (score: {metric.score}, threshold: {metric.threshold}, evaluation model: {metric.evaluation_model}, reason: {metric.reason})" + f" - ✅ {metric.__name__} (score: {metric.score}, threshold: {metric.threshold}, strict: {metric.strict_mode}, evaluation model: {metric.evaluation_model}, reason: {metric.reason})" ) if metric.score_breakdown: for metric_name, score in metric.score_breakdown.items(): diff --git a/deepeval/metrics/answer_relevancy/answer_relevancy.py b/deepeval/metrics/answer_relevancy/answer_relevancy.py index 3b5991c91..e862eeb11 100644 --- a/deepeval/metrics/answer_relevancy/answer_relevancy.py +++ b/deepeval/metrics/answer_relevancy/answer_relevancy.py @@ -35,35 +35,38 @@ def __init__( self.asynchronous = asynchronous self.strict_mode = strict_mode - def measure(self, test_case: LLMTestCase) -> float: + def measure( + self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None + ) -> float: if test_case.input is None or test_case.actual_output is None: raise ValueError("Input or actual output cannot be None") + asynchronous = ( + _asynchronous if _asynchronous is not None else self.asynchronous + ) with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + asynchronous, ): - if self.asynchronous: + if asynchronous: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) ) else: - self.truths = self._generate_statements( - test_case.retrieval_context + self.statements: List[str] = self._generate_statements( + test_case.actual_output + ) + self.verdicts: List[AnswerRelvancyVerdict] = ( + self._generate_verdicts(test_case.input) ) - self.claims = self._generate_verdicts(test_case.actual_output) - self.verdicts = self._generate_verdicts() self.score = self._calculate_score() - self.reason = self._generate_reason() + self.reason = self._generate_reason(test_case.input) self.success = self.score >= self.threshold capture_metric_type(self.__name__) return self.score - ################################ - ###### Asynchronous logic ###### - ################################ async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: @@ -71,7 +74,7 @@ async def a_measure( self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + True, _show_indicator, ): self.statements: List[str] = await self._a_generate_statements( @@ -103,46 +106,6 @@ async def _a_generate_reason(self, input: str) -> str: res = await self.model.a_generate(prompt) return res - async def _a_generate_verdicts( - self, input: str - ) -> List[AnswerRelvancyVerdict]: - prompt = AnswerRelevancyTemplate.generate_verdicts( - input=input, - actual_output=self.statements, - ) - res = await self.model.a_generate(prompt) - data = trimAndLoadJson(res) - verdicts = [AnswerRelvancyVerdict(**item) for item in 
data["verdicts"]] - return verdicts - - async def _a_generate_statements( - self, - actual_output: str, - ) -> List[str]: - prompt = AnswerRelevancyTemplate.generate_statements( - actual_output=actual_output, - ) - - res = await self.model.a_generate(prompt) - data = trimAndLoadJson(res) - return data["statements"] - - ############################### - ###### Synchronous logic ###### - ############################### - def _calculate_score(self): - number_of_verdicts = len(self.verdicts) - if number_of_verdicts == 0: - return 0 - - relevant_count = 0 - for verdict in self.verdicts: - if verdict.verdict.strip().lower() != "no": - relevant_count += 1 - - score = relevant_count / number_of_verdicts - return 0 if self.strict_mode and score < self.threshold else score - def _generate_reason(self, input: str) -> str: if self.include_reason is False: return None @@ -160,7 +123,25 @@ def _generate_reason(self, input: str) -> str: res = self.model.generate(prompt) return res + async def _a_generate_verdicts( + self, input: str + ) -> List[AnswerRelvancyVerdict]: + if len(self.statements) == 0: + return [] + + prompt = AnswerRelevancyTemplate.generate_verdicts( + input=input, + actual_output=self.statements, + ) + res = await self.model.a_generate(prompt) + data = trimAndLoadJson(res) + verdicts = [AnswerRelvancyVerdict(**item) for item in data["verdicts"]] + return verdicts + def _generate_verdicts(self, input: str) -> List[AnswerRelvancyVerdict]: + if len(self.statements) == 0: + return [] + prompt = AnswerRelevancyTemplate.generate_verdicts( input=input, actual_output=self.statements, @@ -170,6 +151,17 @@ def _generate_verdicts(self, input: str) -> List[AnswerRelvancyVerdict]: verdicts = [AnswerRelvancyVerdict(**item) for item in data["verdicts"]] return verdicts + async def _a_generate_statements( + self, + actual_output: str, + ) -> List[str]: + prompt = AnswerRelevancyTemplate.generate_statements( + actual_output=actual_output, + ) + res = await self.model.a_generate(prompt) + data = trimAndLoadJson(res) + return data["statements"] + def _generate_statements( self, actual_output: str, @@ -181,6 +173,21 @@ def _generate_statements( data = trimAndLoadJson(res) return data["statements"] + def _calculate_score(self): + print(self.statements) + print(self.verdicts) + number_of_verdicts = len(self.verdicts) + if number_of_verdicts == 0: + return 1 + + relevant_count = 0 + for verdict in self.verdicts: + if verdict.verdict.strip().lower() != "no": + relevant_count += 1 + + score = relevant_count / number_of_verdicts + return 0 if self.strict_mode and score < self.threshold else score + def is_successful(self) -> bool: self.success = self.score >= self.threshold return self.success diff --git a/deepeval/metrics/base_metric.py b/deepeval/metrics/base_metric.py index 669b38386..981e66083 100644 --- a/deepeval/metrics/base_metric.py +++ b/deepeval/metrics/base_metric.py @@ -9,6 +9,8 @@ class BaseMetric: score_breakdown: Dict = None reason: Optional[str] = None evaluation_model: Optional[str] = None + strict_mode: bool = False + asynchronous: Optional[bool] = None @property def threshold(self) -> float: diff --git a/deepeval/metrics/bias/bias.py b/deepeval/metrics/bias/bias.py index 490dcae27..6a74c847c 100644 --- a/deepeval/metrics/bias/bias.py +++ b/deepeval/metrics/bias/bias.py @@ -35,17 +35,21 @@ def __init__( self.asynchronous = asynchronous self.strict_mode = strict_mode - def measure(self, test_case: LLMTestCase) -> float: + def measure( + self, test_case: LLMTestCase, _asynchronous: Optional[bool] 
= None + ) -> float: if test_case.input is None or test_case.actual_output is None: raise ValueError("Input or actual output cannot be None") - + asynchronous = ( + _asynchronous if _asynchronous is not None else self.asynchronous + ) with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + asynchronous, ): - if self.asynchronous: + if asynchronous: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) @@ -68,7 +72,7 @@ async def a_measure( self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + True, _show_indicator, ): self.opinions: List[str] = await self._a_generate_opinions( @@ -81,19 +85,6 @@ async def a_measure( capture_metric_type(self.__name__) return self.score - def _calculate_score(self) -> float: - number_of_verdicts = len(self.verdicts) - if number_of_verdicts == 0: - return 0 - - bias_count = 0 - for verdict in self.verdicts: - if verdict.verdict.strip().lower() == "yes": - bias_count += 1 - - score = bias_count / number_of_verdicts - return 1 if self.strict_mode and score > self.threshold else score - async def _a_generate_reason(self) -> str: if self.include_reason is False: return None @@ -127,6 +118,9 @@ def _generate_reason(self) -> str: return res async def _a_generate_verdicts(self) -> List[BiasVerdict]: + if len(self.opinions) == 0: + return [] + verdicts: List[BiasVerdict] = [] prompt = BiasTemplate.generate_verdicts(opinions=self.opinions) res = await self.model.a_generate(prompt) @@ -135,6 +129,9 @@ async def _a_generate_verdicts(self) -> List[BiasVerdict]: return verdicts def _generate_verdicts(self) -> List[BiasVerdict]: + if len(self.opinions) == 0: + return [] + verdicts: List[BiasVerdict] = [] prompt = BiasTemplate.generate_verdicts(opinions=self.opinions) res = self.model.generate(prompt) @@ -154,6 +151,19 @@ def _generate_opinions(self, actual_output: str) -> List[str]: data = trimAndLoadJson(res) return data["opinions"] + def _calculate_score(self) -> float: + number_of_verdicts = len(self.verdicts) + if number_of_verdicts == 0: + return 0 + + bias_count = 0 + for verdict in self.verdicts: + if verdict.verdict.strip().lower() == "yes": + bias_count += 1 + + score = bias_count / number_of_verdicts + return 1 if self.strict_mode and score > self.threshold else score + def is_successful(self) -> bool: return self.success diff --git a/deepeval/metrics/contextual_precision/contextual_precision.py b/deepeval/metrics/contextual_precision/contextual_precision.py index 938ccea8d..2c4f51dec 100644 --- a/deepeval/metrics/contextual_precision/contextual_precision.py +++ b/deepeval/metrics/contextual_precision/contextual_precision.py @@ -36,7 +36,9 @@ def __init__( self.asynchronous = asynchronous self.strict_mode = strict_mode - def measure(self, test_case: LLMTestCase) -> float: + def measure( + self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None + ) -> float: if ( test_case.input is None or test_case.actual_output is None @@ -46,15 +48,17 @@ def measure(self, test_case: LLMTestCase) -> float: raise ValueError( "Input, actual output, expected output, or retrieval context cannot be None" ) - + asynchronous = ( + _asynchronous if _asynchronous is not None else self.asynchronous + ) with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + asynchronous, ): - if self.asynchronous: + if asynchronous: loop = get_or_create_event_loop() loop.run_until_complete( 
self.a_measure(test_case, _show_indicator=False) @@ -80,7 +84,7 @@ async def a_measure( self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + True, _show_indicator, ): self.verdicts: List[ContextualPrecisionVerdict] = ( @@ -109,7 +113,6 @@ async def _a_generate_reason(self, input: str): verdicts=retrieval_contexts_verdicts, score=format(self.score, ".2f"), ) - res = await self.model.a_generate(prompt) return res @@ -126,40 +129,9 @@ def _generate_reason(self, input: str): verdicts=retrieval_contexts_verdicts, score=format(self.score, ".2f"), ) - res = self.model.generate(prompt) return res - def _calculate_score(self): - number_of_verdicts = len(self.verdicts) - if number_of_verdicts == 0: - return 0 - - # Convert verdicts to a binary list where 'yes' is 1 and others are 0 - node_verdicts = [ - 1 if v.verdict.strip().lower() == "yes" else 0 - for v in self.verdicts - ] - - sum_weighted_precision_at_k = 0.0 - relevant_nodes_count = 0 - - # Go through each item in the response - for k, is_relevant in enumerate(node_verdicts, start=1): - # If the item is relevant, update the counter and add the weighted precision at k to the sum - if is_relevant: - relevant_nodes_count += 1 - precision_at_k = relevant_nodes_count / k - sum_weighted_precision_at_k += precision_at_k * is_relevant - - if relevant_nodes_count == 0: - return 0 - - # Calculate weighted cumulative precision - score = sum_weighted_precision_at_k / relevant_nodes_count - - return 0 if self.strict_mode and score < self.threshold else score - async def _a_generate_verdicts( self, input: str, expected_output: str, retrieval_context: List[str] ) -> List[ContextualPrecisionVerdict]: @@ -190,6 +162,32 @@ def _generate_verdicts( ] return verdicts + def _calculate_score(self): + number_of_verdicts = len(self.verdicts) + if number_of_verdicts == 0: + return 0 + + # Convert verdicts to a binary list where 'yes' is 1 and others are 0 + node_verdicts = [ + 1 if v.verdict.strip().lower() == "yes" else 0 + for v in self.verdicts + ] + + sum_weighted_precision_at_k = 0.0 + relevant_nodes_count = 0 + for k, is_relevant in enumerate(node_verdicts, start=1): + # If the item is relevant, update the counter and add the weighted precision at k to the sum + if is_relevant: + relevant_nodes_count += 1 + precision_at_k = relevant_nodes_count / k + sum_weighted_precision_at_k += precision_at_k * is_relevant + + if relevant_nodes_count == 0: + return 0 + # Calculate weighted cumulative precision + score = sum_weighted_precision_at_k / relevant_nodes_count + return 0 if self.strict_mode and score < self.threshold else score + def is_successful(self) -> bool: self.success = self.score >= self.threshold return self.success diff --git a/deepeval/metrics/contextual_recall/contextual_recall.py b/deepeval/metrics/contextual_recall/contextual_recall.py index 324738bb1..718529bea 100644 --- a/deepeval/metrics/contextual_recall/contextual_recall.py +++ b/deepeval/metrics/contextual_recall/contextual_recall.py @@ -34,7 +34,9 @@ def __init__( self.asynchronous = asynchronous self.strict_mode = strict_mode - def measure(self, test_case: LLMTestCase) -> float: + def measure( + self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None + ) -> float: if ( test_case.input is None or test_case.actual_output is None @@ -44,13 +46,16 @@ def measure(self, test_case: LLMTestCase) -> float: raise ValueError( "Input, actual output, expected output, or retrieval context cannot be None" ) + asynchronous = ( + _asynchronous if _asynchronous is not None 
else self.asynchronous + ) with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + asynchronous, ): - if self.asynchronous: + if asynchronous: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) @@ -74,7 +79,7 @@ async def a_measure( self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + True, _show_indicator, ): print("a contextual recall") diff --git a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py index e3c9efbde..0a074ba92 100644 --- a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +++ b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py @@ -39,7 +39,9 @@ def __init__( self.asynchronous = asynchronous self.strict_mode = strict_mode - def measure(self, test_case: LLMTestCase) -> float: + def measure( + self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None + ) -> float: if ( test_case.input is None or test_case.actual_output is None @@ -48,13 +50,16 @@ def measure(self, test_case: LLMTestCase) -> float: raise ValueError( "Input, actual output, or retrieval context cannot be None" ) + asynchronous = ( + _asynchronous if _asynchronous is not None else self.asynchronous + ) with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + asynchronous, ): - if self.asynchronous: + if asynchronous: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) @@ -78,7 +83,7 @@ async def a_measure( self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + True, _show_indicator, ): print("a contextual relevancy") diff --git a/deepeval/metrics/cost.py b/deepeval/metrics/cost.py index 08a635d72..dfc3310e4 100644 --- a/deepeval/metrics/cost.py +++ b/deepeval/metrics/cost.py @@ -1,3 +1,5 @@ +from typing import Optional + from deepeval.metrics import BaseMetric from deepeval.test_case import LLMTestCase from deepeval.telemetry import capture_metric_type @@ -7,7 +9,9 @@ class CostMetric(BaseMetric): def __init__(self, max_cost: float): self.threshold = max_cost - def measure(self, test_case: LLMTestCase): + def measure( + self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None + ): self.success = test_case.cost <= self.threshold self.score = test_case.cost capture_metric_type(self.__name__) diff --git a/deepeval/metrics/faithfulness/faithfulness.py b/deepeval/metrics/faithfulness/faithfulness.py index 3e88642ee..59427a693 100644 --- a/deepeval/metrics/faithfulness/faithfulness.py +++ b/deepeval/metrics/faithfulness/faithfulness.py @@ -35,7 +35,9 @@ def __init__( self.asynchronous = asynchronous self.strict_mode = strict_mode - def measure(self, test_case: LLMTestCase) -> float: + def measure( + self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None + ) -> float: if ( test_case.input is None or test_case.actual_output is None @@ -44,14 +46,16 @@ def measure(self, test_case: LLMTestCase) -> float: raise ValueError( "Input, actual output, and retrieval context cannot be None" ) - + asynchronous = ( + _asynchronous if _asynchronous is not None else self.asynchronous + ) with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + asynchronous, ): - if self.asynchronous: + if asynchronous: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, 
_show_indicator=False) @@ -66,9 +70,6 @@ def measure(self, test_case: LLMTestCase) -> float: capture_metric_type(self.__name__) return self.score - ################################ - ###### Asynchronous logic ###### - ################################ async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: @@ -76,7 +77,7 @@ async def a_measure( self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + True, _show_indicator, ): self.truths, self.claims = await asyncio.gather( @@ -106,47 +107,6 @@ async def _a_generate_reason(self) -> str: res = await self.model.a_generate(prompt) return res - async def _a_generate_verdicts(self) -> List[FaithfulnessVerdict]: - verdicts: List[FaithfulnessVerdict] = [] - - prompt = FaithfulnessTemplate.generate_verdicts( - claims=self.claims, retrieval_context="\n\n".join(self.truths) - ) - res = await self.model.a_generate(prompt) - data = trimAndLoadJson(res) - verdicts = [FaithfulnessVerdict(**item) for item in data["verdicts"]] - return verdicts - - async def _a_generate_truths(self, retrieval_context: str) -> List[str]: - prompt = FaithfulnessTemplate.generate_claims( - text="\n\n".join(retrieval_context) - ) - res = await self.model.a_generate(prompt) - data = trimAndLoadJson(res) - return data["claims"] - - async def _a_generate_claims(self, actual_output: str) -> List[str]: - prompt = FaithfulnessTemplate.generate_claims(text=actual_output) - res = await self.model.a_generate(prompt) - data = trimAndLoadJson(res) - return data["claims"] - - ############################### - ###### Synchronous logic ###### - ############################### - def _calculate_score(self) -> float: - number_of_verdicts = len(self.verdicts) - if number_of_verdicts == 0: - return 0 - - faithfulness_count = 0 - for verdict in self.verdicts: - if verdict.verdict.strip().lower() != "no": - faithfulness_count += 1 - - score = faithfulness_count / number_of_verdicts - return 0 if self.strict_mode and score < self.threshold else score - def _generate_reason(self) -> str: if self.include_reason is False: return None @@ -163,9 +123,24 @@ def _generate_reason(self) -> str: res = self.model.generate(prompt) return res - def _generate_verdicts(self) -> List[FaithfulnessVerdict]: + async def _a_generate_verdicts(self) -> List[FaithfulnessVerdict]: + if len(self.claims) == 0: + return [] + verdicts: List[FaithfulnessVerdict] = [] + prompt = FaithfulnessTemplate.generate_verdicts( + claims=self.claims, retrieval_context="\n\n".join(self.truths) + ) + res = await self.model.a_generate(prompt) + data = trimAndLoadJson(res) + verdicts = [FaithfulnessVerdict(**item) for item in data["verdicts"]] + return verdicts + + def _generate_verdicts(self) -> List[FaithfulnessVerdict]: + if len(self.claims) == 0: + return [] + verdicts: List[FaithfulnessVerdict] = [] prompt = FaithfulnessTemplate.generate_verdicts( claims=self.claims, retrieval_context="\n\n".join(self.truths) ) @@ -174,12 +149,26 @@ def _generate_verdicts(self) -> List[FaithfulnessVerdict]: verdicts = [FaithfulnessVerdict(**item) for item in data["verdicts"]] return verdicts + async def _a_generate_truths(self, retrieval_context: str) -> List[str]: + prompt = FaithfulnessTemplate.generate_truths( + text="\n\n".join(retrieval_context) + ) + res = await self.model.a_generate(prompt) + data = trimAndLoadJson(res) + return data["truths"] + def _generate_truths(self, retrieval_context: str) -> List[str]: - prompt = FaithfulnessTemplate.generate_claims( + prompt = 
FaithfulnessTemplate.generate_truths( text="\n\n".join(retrieval_context) ) res = self.model.generate(prompt) data = trimAndLoadJson(res) + return data["truths"] + + async def _a_generate_claims(self, actual_output: str) -> List[str]: + prompt = FaithfulnessTemplate.generate_claims(text=actual_output) + res = await self.model.a_generate(prompt) + data = trimAndLoadJson(res) return data["claims"] def _generate_claims(self, actual_output: str) -> List[str]: @@ -188,6 +177,19 @@ def _generate_claims(self, actual_output: str) -> List[str]: data = trimAndLoadJson(res) return data["claims"] + def _calculate_score(self) -> float: + number_of_verdicts = len(self.verdicts) + if number_of_verdicts == 0: + return 1 + + faithfulness_count = 0 + for verdict in self.verdicts: + if verdict.verdict.strip().lower() != "no": + faithfulness_count += 1 + + score = faithfulness_count / number_of_verdicts + return 0 if self.strict_mode and score < self.threshold else score + def is_successful(self) -> bool: self.success = self.score >= self.threshold return self.success diff --git a/deepeval/metrics/hallucination/hallucination.py b/deepeval/metrics/hallucination/hallucination.py index 4fc37f259..ba8bee310 100644 --- a/deepeval/metrics/hallucination/hallucination.py +++ b/deepeval/metrics/hallucination/hallucination.py @@ -37,20 +37,25 @@ def __init__( self.asynchronous = asynchronous self.strict_mode = strict_mode - def measure(self, test_case: LLMTestCase): + def measure( + self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None + ) -> float: if ( test_case.input is None or test_case.actual_output is None or test_case.context is None ): raise ValueError("Input, actual output, or context cannot be None") + asynchronous = ( + _asynchronous if _asynchronous is not None else self.asynchronous + ) with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + asynchronous, ): - if self.asynchronous: + if asynchronous: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) @@ -74,7 +79,7 @@ async def a_measure( self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + True, _show_indicator, ): print("a hallucination") @@ -131,21 +136,6 @@ def _generate_reason(self): res = self.model.generate(prompt) return res - def _calculate_score(self) -> float: - number_of_verdicts = len(self.verdicts) - if number_of_verdicts == 0: - return 0 - - hallucination_count = 0 - - for verdict in self.verdicts: - if verdict.verdict.strip().lower() == "no": - hallucination_count += 1 - - score = hallucination_count / number_of_verdicts - - return 1 if self.strict_mode and score > self.threshold else score - async def _a_generate_verdicts( self, actual_output: str, contexts: List[str] ) -> List[HallucinationVerdict]: @@ -170,6 +160,19 @@ def _generate_verdicts( verdicts = [HallucinationVerdict(**item) for item in data["verdicts"]] return verdicts + def _calculate_score(self) -> float: + number_of_verdicts = len(self.verdicts) + if number_of_verdicts == 0: + return 0 + + hallucination_count = 0 + for verdict in self.verdicts: + if verdict.verdict.strip().lower() == "no": + hallucination_count += 1 + + score = hallucination_count / number_of_verdicts + return 1 if self.strict_mode and score > self.threshold else score + def is_successful(self) -> bool: self.success = self.score <= self.threshold return self.success diff --git a/deepeval/metrics/latency.py b/deepeval/metrics/latency.py index 27d45aef7..adff8b131 
100644 --- a/deepeval/metrics/latency.py +++ b/deepeval/metrics/latency.py @@ -1,3 +1,5 @@ +from typing import Optional + from deepeval.metrics import BaseMetric from deepeval.test_case import LLMTestCase from deepeval.telemetry import capture_metric_type @@ -13,7 +15,9 @@ def measure(self, test_case: LLMTestCase): capture_metric_type(self.__name__) return self.score - async def a_measure(self, test_case: LLMTestCase): + async def a_measure( + self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None + ): self.success = test_case.latency <= self.threshold self.score = test_case.latency capture_metric_type(self.__name__) diff --git a/deepeval/metrics/ragas.py b/deepeval/metrics/ragas.py index 9809af1b4..53fbe15cf 100644 --- a/deepeval/metrics/ragas.py +++ b/deepeval/metrics/ragas.py @@ -27,7 +27,9 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() - def measure(self, test_case: LLMTestCase): + def measure( + self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None + ): # sends to server try: from ragas import evaluate @@ -91,7 +93,9 @@ def __init__( async def a_measure(self, test_case: LLMTestCase): return self.measure(test_case) - def measure(self, test_case: LLMTestCase): + def measure( + self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None + ): # sends to server try: from ragas import evaluate @@ -153,7 +157,9 @@ def __init__( async def a_measure(self, test_case: LLMTestCase): return self.measure(test_case) - def measure(self, test_case: LLMTestCase): + def measure( + self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None + ): # sends to server try: from ragas import evaluate @@ -212,7 +218,9 @@ def __init__( async def a_measure(self, test_case: LLMTestCase): return self.measure(test_case) - def measure(self, test_case: LLMTestCase): + def measure( + self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None + ): # sends to server try: from ragas import evaluate @@ -267,7 +275,9 @@ def __init__( async def a_measure(self, test_case: LLMTestCase): return self.measure(test_case) - def measure(self, test_case: LLMTestCase): + def measure( + self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None + ): # sends to server try: from ragas import evaluate @@ -323,7 +333,9 @@ def __init__( async def a_measure(self, test_case: LLMTestCase): return self.measure(test_case) - def measure(self, test_case: LLMTestCase): + def measure( + self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None + ): # sends to server try: from ragas import evaluate @@ -380,7 +392,9 @@ def __init__( async def a_measure(self, test_case: LLMTestCase): return self.measure(test_case) - def measure(self, test_case: LLMTestCase): + def measure( + self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None + ): try: from ragas import evaluate from ragas.metrics.critique import coherence @@ -435,7 +449,9 @@ def __init__( async def a_measure(self, test_case: LLMTestCase): return self.measure(test_case) - def measure(self, test_case: LLMTestCase): + def measure( + self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None + ): try: from ragas import evaluate from ragas.metrics.critique import maliciousness @@ -491,7 +507,9 @@ def __init__( async def a_measure(self, test_case: LLMTestCase): return self.measure(test_case) - def measure(self, test_case: LLMTestCase): + def measure( + self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None + ): try: from ragas import evaluate from 
ragas.metrics.critique import correctness @@ -547,7 +565,9 @@ def __init__( async def a_measure(self, test_case: LLMTestCase): return self.measure(test_case) - def measure(self, test_case: LLMTestCase): + def measure( + self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None + ): try: from ragas import evaluate from ragas.metrics.critique import conciseness @@ -605,7 +625,9 @@ def __init__( async def a_measure(self, test_case: LLMTestCase): return self.measure(test_case) - def measure(self, test_case: LLMTestCase): + def measure( + self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None + ): # sends to server try: from ragas import evaluate @@ -632,6 +654,8 @@ def measure(self, test_case: LLMTestCase): ), ] + print(metrics) + for metric in metrics: score = metric.measure(test_case) score_breakdown[metric.__name__] = score diff --git a/deepeval/metrics/summarization/summarization.py b/deepeval/metrics/summarization/summarization.py index 7f02da7f6..20c4cdd10 100644 --- a/deepeval/metrics/summarization/summarization.py +++ b/deepeval/metrics/summarization/summarization.py @@ -59,17 +59,22 @@ def __init__( self.n = n self.strict_mode = strict_mode - def measure(self, test_case: LLMTestCase): + def measure( + self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None + ) -> float: if test_case.input is None or test_case.actual_output is None: raise ValueError("Input or actual output cannot be None") + asynchronous = ( + _asynchronous if _asynchronous is not None else self.asynchronous + ) with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + asynchronous, ): - if self.asynchronous: + if asynchronous: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) @@ -104,7 +109,7 @@ async def a_measure( self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + True, _show_indicator, ): print("a summarization") @@ -324,6 +329,9 @@ def _generate_coverage_verdicts( async def _a_generate_alignment_verdicts( self, ) -> List[SummarizationAlignmentVerdict]: + if len(self.claims) == 0: + return [] + verdicts: List[SummarizationAlignmentVerdict] = [] prompt = SummarizationTemplate.generate_alignment_verdicts( summary_claims=self.claims, orignal_text="\n\n".join(self.truths) @@ -338,6 +346,9 @@ async def _a_generate_alignment_verdicts( def _generate_alignment_verdicts( self, ) -> List[SummarizationAlignmentVerdict]: + if len(self.claims) == 0: + return [] + verdicts: List[SummarizationAlignmentVerdict] = [] prompt = SummarizationTemplate.generate_alignment_verdicts( summary_claims=self.claims, orignal_text="\n\n".join(self.truths) diff --git a/deepeval/metrics/toxicity/toxicity.py b/deepeval/metrics/toxicity/toxicity.py index 5e6383f30..0ac653f56 100644 --- a/deepeval/metrics/toxicity/toxicity.py +++ b/deepeval/metrics/toxicity/toxicity.py @@ -36,17 +36,22 @@ def __init__( self.asynchronous = asynchronous self.strict_mode = strict_mode - def measure(self, test_case: LLMTestCase): + def measure( + self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None + ) -> float: if test_case.input is None or test_case.actual_output is None: raise ValueError("Input or actual output cannot be None") + asynchronous = ( + _asynchronous if _asynchronous is not None else self.asynchronous + ) with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + asynchronous, ): - if self.asynchronous: + if asynchronous: loop 
= get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) @@ -70,7 +75,7 @@ async def a_measure( self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + True, _show_indicator, ): self.opinions: List[str] = await self._a_generate_opinions( @@ -99,7 +104,6 @@ async def _a_generate_reason(self) -> str: toxics=toxics, score=format(self.score, ".2f"), ) - res = await self.model.a_generate(prompt) return res @@ -116,24 +120,13 @@ def _generate_reason(self) -> str: toxics=toxics, score=format(self.score, ".2f"), ) - res = self.model.generate(prompt) return res - def _calculate_score(self) -> float: - total = len(self.verdicts) - if total == 0: - return 0 - - toxic_count = 0 - for verdict in self.verdicts: - if verdict.verdict.strip().lower() == "yes": - toxic_count += 1 - - score = toxic_count / total - return 1 if self.strict_mode and score > self.threshold else score - async def _a_generate_verdicts(self) -> List[ToxicityVerdict]: + if len(self.opinions) == 0: + return [] + verdicts: List[ToxicityVerdict] = [] prompt = ToxicityTemplate.generate_verdicts(opinions=self.opinions) res = await self.model.a_generate(prompt) @@ -142,6 +135,9 @@ async def _a_generate_verdicts(self) -> List[ToxicityVerdict]: return verdicts def _generate_verdicts(self) -> List[ToxicityVerdict]: + if len(self.opinions) == 0: + return [] + verdicts: List[ToxicityVerdict] = [] prompt = ToxicityTemplate.generate_verdicts(opinions=self.opinions) res = self.model.generate(prompt) @@ -161,6 +157,19 @@ def _generate_opinions(self, actual_output: str) -> List[str]: data = trimAndLoadJson(res) return data["opinions"] + def _calculate_score(self) -> float: + total = len(self.verdicts) + if total == 0: + return 0 + + toxic_count = 0 + for verdict in self.verdicts: + if verdict.verdict.strip().lower() == "yes": + toxic_count += 1 + + score = toxic_count / total + return 1 if self.strict_mode and score > self.threshold else score + def is_successful(self) -> bool: return self.success From 4aa2b21b35b45c53e2c5774a085056c40abe1e08 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Fri, 8 Mar 2024 18:54:00 +0800 Subject: [PATCH 34/59] fix test --- tests/test_custom_metric.py | 3 ++- tests/test_deployment.py | 15 +++++++++------ tests/test_everything.py | 27 ++++++++++++++------------- tests/test_summarization.py | 4 ++-- 4 files changed, 27 insertions(+), 22 deletions(-) diff --git a/tests/test_custom_metric.py b/tests/test_custom_metric.py index fcabad68e..6a897a6ec 100644 --- a/tests/test_custom_metric.py +++ b/tests/test_custom_metric.py @@ -39,4 +39,5 @@ def test_length_metric(): actual_output="This is a long sentence that is more than 3 letters", latency=8.3, ) - assert_test(test_case, [metric]) + # a_measure not implemented + assert_test(test_case, [metric], asynchronous=False) diff --git a/tests/test_deployment.py b/tests/test_deployment.py index 6fb1e0c09..d7de56318 100644 --- a/tests/test_deployment.py +++ b/tests/test_deployment.py @@ -13,7 +13,7 @@ class FakeMetric(BaseMetric): def __init__(self, threshold: float = 0.5): self.threshold = threshold - def measure(self, test_case: LLMTestCase): + def measure(self, test_case: LLMTestCase, _asynchronous): # Set self.success and self.score in the "measure" method self.score = random.uniform(0.0, 1.0) self.success = self.score >= self.threshold @@ -22,6 +22,14 @@ def measure(self, test_case: LLMTestCase): self.reason = "This metric looking good!" 
return self.score + def a_meausre(self, test_case: LLMTestCase): + self.score = random.uniform(0.0, 1.0) + self.success = self.score >= self.threshold + # You can also optionally set a reason for the score returned. + # This is particularly useful for a score computed using LLMs + self.reason = "This async metric looking good!" + return self.score + def is_successful(self): return self.success @@ -57,9 +65,4 @@ def hyperparameters(): return { "chunk_size": 500, "temperature": 0, - "prompt_template": """You are a helpful assistant, answer the following question in a non-judgemental tone. - - Question: - {question} - """, } diff --git a/tests/test_everything.py b/tests/test_everything.py index 1e3a394b5..d730aa8ef 100644 --- a/tests/test_everything.py +++ b/tests/test_everything.py @@ -76,13 +76,13 @@ def test_everything(): metric1 = AnswerRelevancyMetric(threshold=0.5, strict_mode=strict_mode) metric2 = FaithfulnessMetric(threshold=0.5, strict_mode=strict_mode) - # metric3 = ContextualPrecisionMetric(threshold=0.5, strict_mode=strict_mode) - # metric4 = ContextualRecallMetric(threshold=0.5, strict_mode=strict_mode) - # metric5 = ContextualRelevancyMetric(threshold=0.5, strict_mode=strict_mode) + metric3 = ContextualPrecisionMetric(threshold=0.5, strict_mode=strict_mode) + metric4 = ContextualRecallMetric(threshold=0.5, strict_mode=strict_mode) + metric5 = ContextualRelevancyMetric(threshold=0.5, strict_mode=strict_mode) metric6 = BiasMetric(threshold=0.5, strict_mode=strict_mode) - # metric7 = ToxicityMetric(threshold=0.5, strict_mode=strict_mode) - # metric8 = HallucinationMetric(threshold=0.5, strict_mode=strict_mode) - # metric9 = SummarizationMetric(threshold=0.5, strict_mode=strict_mode) + metric7 = ToxicityMetric(threshold=0.5, strict_mode=strict_mode) + metric8 = HallucinationMetric(threshold=0.5, strict_mode=strict_mode) + metric9 = SummarizationMetric(threshold=0.5, strict_mode=strict_mode) # metric10 = GEval( # name="Coherence", # criteria="Coherence - determine if the actual output is coherent with the input.", @@ -105,15 +105,16 @@ def test_everything(): [ metric1, metric2, - # metric3, - # metric4, - # metric5, - # metric6, - # metric7, - # metric8, - # metric9, + metric3, + metric4, + metric5, + metric6, + metric7, + metric8, + metric9, # metric10, ], + asynchronous=False, ) diff --git a/tests/test_summarization.py b/tests/test_summarization.py index f330a152a..7a293fbfa 100644 --- a/tests/test_summarization.py +++ b/tests/test_summarization.py @@ -4,7 +4,7 @@ from deepeval.metrics import SummarizationMetric -@pytest.mark.skip(reason="openai is expensive") +@pytest.mark.skip(reason="openai is expdensive") def test_summarization(): metric = SummarizationMetric() @@ -26,4 +26,4 @@ def test_summarization(): test_case = LLMTestCase(input=input, actual_output=output) - assert_test(test_case, [metric]) + assert_test(test_case, [metric], asynchronous=False) From 1c990c9bc8d7ef3194e9cd61871118d62aeed9ea Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Fri, 8 Mar 2024 19:05:40 +0800 Subject: [PATCH 35/59] reformat --- deepeval/evaluate.py | 5 +- .../answer_relevancy/answer_relevancy.py | 13 ++---- deepeval/metrics/bias/bias.py | 12 ++--- .../contextual_precision.py | 11 ++--- .../contextual_recall/contextual_recall.py | 12 ++--- .../contextual_relevancy.py | 12 ++--- deepeval/metrics/cost.py | 4 +- deepeval/metrics/faithfulness/faithfulness.py | 12 ++--- .../metrics/hallucination/hallucination.py | 13 ++---- deepeval/metrics/latency.py | 4 +- deepeval/metrics/ragas.py | 46 
+++++-------------- .../metrics/summarization/summarization.py | 12 ++--- deepeval/metrics/toxicity/toxicity.py | 11 ++--- tests/test_everything.py | 2 +- 14 files changed, 48 insertions(+), 121 deletions(-) diff --git a/deepeval/evaluate.py b/deepeval/evaluate.py index 55a09c968..739e83498 100644 --- a/deepeval/evaluate.py +++ b/deepeval/evaluate.py @@ -81,7 +81,10 @@ def execute_test_cases( test_start_time = time.perf_counter() for metric in metrics: - metric.measure(test_case, _asynchronous=False) + # Override metric async + metric.asynchronous = False + + metric.measure(test_case) metric_metadata = MetricsMetadata( metric=metric.__name__, score=metric.score, diff --git a/deepeval/metrics/answer_relevancy/answer_relevancy.py b/deepeval/metrics/answer_relevancy/answer_relevancy.py index e862eeb11..389e2bf09 100644 --- a/deepeval/metrics/answer_relevancy/answer_relevancy.py +++ b/deepeval/metrics/answer_relevancy/answer_relevancy.py @@ -35,21 +35,16 @@ def __init__( self.asynchronous = asynchronous self.strict_mode = strict_mode - def measure( - self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None - ) -> float: + def measure(self, test_case: LLMTestCase) -> float: if test_case.input is None or test_case.actual_output is None: raise ValueError("Input or actual output cannot be None") - asynchronous = ( - _asynchronous if _asynchronous is not None else self.asynchronous - ) with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode, - asynchronous, + self.asynchronous, ): - if asynchronous: + if self.asynchronous: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) @@ -174,8 +169,6 @@ def _generate_statements( return data["statements"] def _calculate_score(self): - print(self.statements) - print(self.verdicts) number_of_verdicts = len(self.verdicts) if number_of_verdicts == 0: return 1 diff --git a/deepeval/metrics/bias/bias.py b/deepeval/metrics/bias/bias.py index 6a74c847c..e026a8a6e 100644 --- a/deepeval/metrics/bias/bias.py +++ b/deepeval/metrics/bias/bias.py @@ -35,21 +35,17 @@ def __init__( self.asynchronous = asynchronous self.strict_mode = strict_mode - def measure( - self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None - ) -> float: + def measure(self, test_case: LLMTestCase) -> float: if test_case.input is None or test_case.actual_output is None: raise ValueError("Input or actual output cannot be None") - asynchronous = ( - _asynchronous if _asynchronous is not None else self.asynchronous - ) + with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode, - asynchronous, + self.asynchronous, ): - if asynchronous: + if self.asynchronous: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) diff --git a/deepeval/metrics/contextual_precision/contextual_precision.py b/deepeval/metrics/contextual_precision/contextual_precision.py index 2c4f51dec..6e26d566f 100644 --- a/deepeval/metrics/contextual_precision/contextual_precision.py +++ b/deepeval/metrics/contextual_precision/contextual_precision.py @@ -36,9 +36,7 @@ def __init__( self.asynchronous = asynchronous self.strict_mode = strict_mode - def measure( - self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None - ) -> float: + def measure(self, test_case: LLMTestCase) -> float: if ( test_case.input is None or test_case.actual_output is None @@ -48,17 +46,14 @@ def measure( raise ValueError( "Input, actual output, expected output, or 
retrieval context cannot be None" ) - asynchronous = ( - _asynchronous if _asynchronous is not None else self.asynchronous - ) with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode, - asynchronous, + self.asynchronous, ): - if asynchronous: + if self.asynchronous: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) diff --git a/deepeval/metrics/contextual_recall/contextual_recall.py b/deepeval/metrics/contextual_recall/contextual_recall.py index 718529bea..ccd73f834 100644 --- a/deepeval/metrics/contextual_recall/contextual_recall.py +++ b/deepeval/metrics/contextual_recall/contextual_recall.py @@ -34,9 +34,7 @@ def __init__( self.asynchronous = asynchronous self.strict_mode = strict_mode - def measure( - self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None - ) -> float: + def measure(self, test_case: LLMTestCase) -> float: if ( test_case.input is None or test_case.actual_output is None @@ -46,16 +44,13 @@ def measure( raise ValueError( "Input, actual output, expected output, or retrieval context cannot be None" ) - asynchronous = ( - _asynchronous if _asynchronous is not None else self.asynchronous - ) with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode, - asynchronous, + self.asynchronous, ): - if asynchronous: + if self.asynchronous: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) @@ -82,7 +77,6 @@ async def a_measure( True, _show_indicator, ): - print("a contextual recall") self.verdicts: List[ContextualRecallVerdict] = ( await self._a_generate_verdicts( test_case.expected_output, test_case.retrieval_context diff --git a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py index 0a074ba92..213c52a65 100644 --- a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +++ b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py @@ -39,9 +39,7 @@ def __init__( self.asynchronous = asynchronous self.strict_mode = strict_mode - def measure( - self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None - ) -> float: + def measure(self, test_case: LLMTestCase) -> float: if ( test_case.input is None or test_case.actual_output is None @@ -50,16 +48,13 @@ def measure( raise ValueError( "Input, actual output, or retrieval context cannot be None" ) - asynchronous = ( - _asynchronous if _asynchronous is not None else self.asynchronous - ) with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode, - asynchronous, + self.asynchronous, ): - if asynchronous: + if self.asynchronous: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) @@ -86,7 +81,6 @@ async def a_measure( True, _show_indicator, ): - print("a contextual relevancy") self.verdicts: List[ContextualRelevancyVerdict] = ( await self._a_generate_verdicts( test_case.input, test_case.retrieval_context diff --git a/deepeval/metrics/cost.py b/deepeval/metrics/cost.py index dfc3310e4..84cb47654 100644 --- a/deepeval/metrics/cost.py +++ b/deepeval/metrics/cost.py @@ -9,9 +9,7 @@ class CostMetric(BaseMetric): def __init__(self, max_cost: float): self.threshold = max_cost - def measure( - self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None - ): + def measure(self, test_case: LLMTestCase): self.success = test_case.cost <= self.threshold self.score = test_case.cost 
capture_metric_type(self.__name__) diff --git a/deepeval/metrics/faithfulness/faithfulness.py b/deepeval/metrics/faithfulness/faithfulness.py index 59427a693..bfff13f81 100644 --- a/deepeval/metrics/faithfulness/faithfulness.py +++ b/deepeval/metrics/faithfulness/faithfulness.py @@ -35,9 +35,7 @@ def __init__( self.asynchronous = asynchronous self.strict_mode = strict_mode - def measure( - self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None - ) -> float: + def measure(self, test_case: LLMTestCase) -> float: if ( test_case.input is None or test_case.actual_output is None @@ -46,16 +44,14 @@ def measure( raise ValueError( "Input, actual output, and retrieval context cannot be None" ) - asynchronous = ( - _asynchronous if _asynchronous is not None else self.asynchronous - ) + with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode, - asynchronous, + self.asynchronous, ): - if asynchronous: + if self.asynchronous: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) diff --git a/deepeval/metrics/hallucination/hallucination.py b/deepeval/metrics/hallucination/hallucination.py index ba8bee310..766d87eb0 100644 --- a/deepeval/metrics/hallucination/hallucination.py +++ b/deepeval/metrics/hallucination/hallucination.py @@ -37,25 +37,21 @@ def __init__( self.asynchronous = asynchronous self.strict_mode = strict_mode - def measure( - self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None - ) -> float: + def measure(self, test_case: LLMTestCase) -> float: if ( test_case.input is None or test_case.actual_output is None or test_case.context is None ): raise ValueError("Input, actual output, or context cannot be None") - asynchronous = ( - _asynchronous if _asynchronous is not None else self.asynchronous - ) + with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode, - asynchronous, + self.asynchronous, ): - if asynchronous: + if self.asynchronous: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) @@ -82,7 +78,6 @@ async def a_measure( True, _show_indicator, ): - print("a hallucination") self.verdicts: List[HallucinationVerdict] = ( await self._a_generate_verdicts( test_case.actual_output, test_case.context diff --git a/deepeval/metrics/latency.py b/deepeval/metrics/latency.py index adff8b131..9d50d3935 100644 --- a/deepeval/metrics/latency.py +++ b/deepeval/metrics/latency.py @@ -15,9 +15,7 @@ def measure(self, test_case: LLMTestCase): capture_metric_type(self.__name__) return self.score - async def a_measure( - self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None - ): + async def a_measure(self, test_case: LLMTestCase): self.success = test_case.latency <= self.threshold self.score = test_case.latency capture_metric_type(self.__name__) diff --git a/deepeval/metrics/ragas.py b/deepeval/metrics/ragas.py index 53fbe15cf..9809af1b4 100644 --- a/deepeval/metrics/ragas.py +++ b/deepeval/metrics/ragas.py @@ -27,9 +27,7 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() - def measure( - self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None - ): + def measure(self, test_case: LLMTestCase): # sends to server try: from ragas import evaluate @@ -93,9 +91,7 @@ def __init__( async def a_measure(self, test_case: LLMTestCase): return self.measure(test_case) - def measure( - self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None - ): 
+ def measure(self, test_case: LLMTestCase): # sends to server try: from ragas import evaluate @@ -157,9 +153,7 @@ def __init__( async def a_measure(self, test_case: LLMTestCase): return self.measure(test_case) - def measure( - self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None - ): + def measure(self, test_case: LLMTestCase): # sends to server try: from ragas import evaluate @@ -218,9 +212,7 @@ def __init__( async def a_measure(self, test_case: LLMTestCase): return self.measure(test_case) - def measure( - self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None - ): + def measure(self, test_case: LLMTestCase): # sends to server try: from ragas import evaluate @@ -275,9 +267,7 @@ def __init__( async def a_measure(self, test_case: LLMTestCase): return self.measure(test_case) - def measure( - self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None - ): + def measure(self, test_case: LLMTestCase): # sends to server try: from ragas import evaluate @@ -333,9 +323,7 @@ def __init__( async def a_measure(self, test_case: LLMTestCase): return self.measure(test_case) - def measure( - self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None - ): + def measure(self, test_case: LLMTestCase): # sends to server try: from ragas import evaluate @@ -392,9 +380,7 @@ def __init__( async def a_measure(self, test_case: LLMTestCase): return self.measure(test_case) - def measure( - self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None - ): + def measure(self, test_case: LLMTestCase): try: from ragas import evaluate from ragas.metrics.critique import coherence @@ -449,9 +435,7 @@ def __init__( async def a_measure(self, test_case: LLMTestCase): return self.measure(test_case) - def measure( - self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None - ): + def measure(self, test_case: LLMTestCase): try: from ragas import evaluate from ragas.metrics.critique import maliciousness @@ -507,9 +491,7 @@ def __init__( async def a_measure(self, test_case: LLMTestCase): return self.measure(test_case) - def measure( - self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None - ): + def measure(self, test_case: LLMTestCase): try: from ragas import evaluate from ragas.metrics.critique import correctness @@ -565,9 +547,7 @@ def __init__( async def a_measure(self, test_case: LLMTestCase): return self.measure(test_case) - def measure( - self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None - ): + def measure(self, test_case: LLMTestCase): try: from ragas import evaluate from ragas.metrics.critique import conciseness @@ -625,9 +605,7 @@ def __init__( async def a_measure(self, test_case: LLMTestCase): return self.measure(test_case) - def measure( - self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None - ): + def measure(self, test_case: LLMTestCase): # sends to server try: from ragas import evaluate @@ -654,8 +632,6 @@ def measure( ), ] - print(metrics) - for metric in metrics: score = metric.measure(test_case) score_breakdown[metric.__name__] = score diff --git a/deepeval/metrics/summarization/summarization.py b/deepeval/metrics/summarization/summarization.py index 20c4cdd10..0f8690dd6 100644 --- a/deepeval/metrics/summarization/summarization.py +++ b/deepeval/metrics/summarization/summarization.py @@ -59,22 +59,17 @@ def __init__( self.n = n self.strict_mode = strict_mode - def measure( - self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None - ) -> float: + def measure(self, test_case: LLMTestCase) -> float: if 
test_case.input is None or test_case.actual_output is None: raise ValueError("Input or actual output cannot be None") - asynchronous = ( - _asynchronous if _asynchronous is not None else self.asynchronous - ) with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode, - asynchronous, + self.asynchronous, ): - if asynchronous: + if self.asynchronous: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) @@ -112,7 +107,6 @@ async def a_measure( True, _show_indicator, ): - print("a summarization") self.truths, self.claims = await asyncio.gather( self._a_generate_claims(test_case.input), self._a_generate_claims(test_case.actual_output), diff --git a/deepeval/metrics/toxicity/toxicity.py b/deepeval/metrics/toxicity/toxicity.py index 0ac653f56..282286031 100644 --- a/deepeval/metrics/toxicity/toxicity.py +++ b/deepeval/metrics/toxicity/toxicity.py @@ -36,22 +36,17 @@ def __init__( self.asynchronous = asynchronous self.strict_mode = strict_mode - def measure( - self, test_case: LLMTestCase, _asynchronous: Optional[bool] = None - ) -> float: + def measure(self, test_case: LLMTestCase) -> float: if test_case.input is None or test_case.actual_output is None: raise ValueError("Input or actual output cannot be None") - asynchronous = ( - _asynchronous if _asynchronous is not None else self.asynchronous - ) with metrics_progress_context( self.__name__, self.evaluation_model, self.strict_mode, - asynchronous, + self.asynchronous, ): - if asynchronous: + if self.asynchronous: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) diff --git a/tests/test_everything.py b/tests/test_everything.py index d730aa8ef..c89cac6c8 100644 --- a/tests/test_everything.py +++ b/tests/test_everything.py @@ -114,7 +114,7 @@ def test_everything(): metric9, # metric10, ], - asynchronous=False, + # asynchronous=False, ) From b735268eeb2c1ea0b2647beea92a4af4e9efcd53 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Fri, 8 Mar 2024 19:25:03 +0800 Subject: [PATCH 36/59] valid test case params --- .../answer_relevancy/answer_relevancy.py | 17 ++++++++--- deepeval/metrics/bias/bias.py | 19 +++++++++---- .../contextual_precision.py | 27 ++++++++++-------- .../contextual_recall/contextual_recall.py | 26 +++++++++-------- .../contextual_relevancy.py | 26 +++++++++-------- deepeval/metrics/faithfulness/faithfulness.py | 25 +++++++++-------- .../metrics/hallucination/hallucination.py | 26 +++++++++-------- .../metrics/summarization/summarization.py | 19 +++++++++---- deepeval/metrics/toxicity/toxicity.py | 18 ++++++++---- deepeval/utils.py | 28 ++++++++++++++++++- tests/test_contextual_precision.py | 6 ++-- 11 files changed, 156 insertions(+), 81 deletions(-) diff --git a/deepeval/metrics/answer_relevancy/answer_relevancy.py b/deepeval/metrics/answer_relevancy/answer_relevancy.py index 389e2bf09..aa0bf3a89 100644 --- a/deepeval/metrics/answer_relevancy/answer_relevancy.py +++ b/deepeval/metrics/answer_relevancy/answer_relevancy.py @@ -2,14 +2,23 @@ from typing import Optional, List, Union from pydantic import BaseModel, Field -from deepeval.utils import trimAndLoadJson, get_or_create_event_loop -from deepeval.test_case import LLMTestCase +from deepeval.utils import ( + trimAndLoadJson, + get_or_create_event_loop, + validate_test_case_params, +) +from deepeval.test_case import LLMTestCase, LLMTestCaseParams from deepeval.metrics import BaseMetric from deepeval.models import GPTModel, DeepEvalBaseLLM from 
deepeval.metrics.answer_relevancy.template import AnswerRelevancyTemplate from deepeval.progress_context import metrics_progress_context from deepeval.telemetry import capture_metric_type +required_params: List[LLMTestCaseParams] = [ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, +] + class AnswerRelvancyVerdict(BaseModel): verdict: str @@ -36,8 +45,7 @@ def __init__( self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: - if test_case.input is None or test_case.actual_output is None: - raise ValueError("Input or actual output cannot be None") + validate_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, @@ -65,6 +73,7 @@ def measure(self, test_case: LLMTestCase) -> float: async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: + validate_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, diff --git a/deepeval/metrics/bias/bias.py b/deepeval/metrics/bias/bias.py index e026a8a6e..720163233 100644 --- a/deepeval/metrics/bias/bias.py +++ b/deepeval/metrics/bias/bias.py @@ -2,14 +2,24 @@ from pydantic import BaseModel, Field from deepeval.metrics import BaseMetric -from deepeval.test_case import LLMTestCase +from deepeval.test_case import LLMTestCase, LLMTestCaseParams from deepeval.progress_context import metrics_progress_context from deepeval.telemetry import capture_metric_type from deepeval.models import GPTModel, DeepEvalBaseLLM -from deepeval.utils import trimAndLoadJson, get_or_create_event_loop +from deepeval.utils import ( + trimAndLoadJson, + get_or_create_event_loop, + validate_test_case_params, +) from deepeval.metrics.bias.template import BiasTemplate +required_params: List[LLMTestCaseParams] = [ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, +] + + # BiasMetric runs a similar algorithm to Dbias: https://arxiv.org/pdf/2208.05777.pdf class BiasVerdict(BaseModel): verdict: str @@ -36,9 +46,7 @@ def __init__( self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: - if test_case.input is None or test_case.actual_output is None: - raise ValueError("Input or actual output cannot be None") - + validate_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, @@ -64,6 +72,7 @@ def measure(self, test_case: LLMTestCase) -> float: async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: + validate_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, diff --git a/deepeval/metrics/contextual_precision/contextual_precision.py b/deepeval/metrics/contextual_precision/contextual_precision.py index 6e26d566f..c912d2912 100644 --- a/deepeval/metrics/contextual_precision/contextual_precision.py +++ b/deepeval/metrics/contextual_precision/contextual_precision.py @@ -1,8 +1,12 @@ from typing import Optional, List, Union from pydantic import BaseModel -from deepeval.utils import trimAndLoadJson, get_or_create_event_loop -from deepeval.test_case import LLMTestCase +from deepeval.utils import ( + trimAndLoadJson, + get_or_create_event_loop, + validate_test_case_params, +) +from deepeval.test_case import LLMTestCase, LLMTestCaseParams from deepeval.metrics import BaseMetric from deepeval.models import GPTModel, DeepEvalBaseLLM from 
deepeval.metrics.contextual_precision.template import ( @@ -12,6 +16,14 @@ from deepeval.telemetry import capture_metric_type +required_params: List[LLMTestCaseParams] = [ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.RETRIEVAL_CONTEXT, + LLMTestCaseParams.EXPECTED_OUTPUT, +] + + class ContextualPrecisionVerdict(BaseModel): verdict: str reason: str @@ -37,15 +49,7 @@ def __init__( self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: - if ( - test_case.input is None - or test_case.actual_output is None - or test_case.retrieval_context is None - or test_case.expected_output is None - ): - raise ValueError( - "Input, actual output, expected output, or retrieval context cannot be None" - ) + validate_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, @@ -75,6 +79,7 @@ def measure(self, test_case: LLMTestCase) -> float: async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: + validate_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, diff --git a/deepeval/metrics/contextual_recall/contextual_recall.py b/deepeval/metrics/contextual_recall/contextual_recall.py index ccd73f834..1992c11b9 100644 --- a/deepeval/metrics/contextual_recall/contextual_recall.py +++ b/deepeval/metrics/contextual_recall/contextual_recall.py @@ -1,14 +1,25 @@ from typing import Optional, List, Union from pydantic import BaseModel, Field -from deepeval.utils import trimAndLoadJson, get_or_create_event_loop -from deepeval.test_case import LLMTestCase +from deepeval.utils import ( + trimAndLoadJson, + get_or_create_event_loop, + validate_test_case_params, +) +from deepeval.test_case import LLMTestCase, LLMTestCaseParams from deepeval.metrics import BaseMetric from deepeval.models import GPTModel, DeepEvalBaseLLM from deepeval.metrics.contextual_recall.template import ContextualRecallTemplate from deepeval.progress_context import metrics_progress_context from deepeval.telemetry import capture_metric_type +required_params: List[LLMTestCaseParams] = [ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.RETRIEVAL_CONTEXT, + LLMTestCaseParams.EXPECTED_OUTPUT, +] + class ContextualRecallVerdict(BaseModel): verdict: str @@ -35,15 +46,7 @@ def __init__( self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: - if ( - test_case.input is None - or test_case.actual_output is None - or test_case.retrieval_context is None - or test_case.expected_output is None - ): - raise ValueError( - "Input, actual output, expected output, or retrieval context cannot be None" - ) + validate_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, @@ -70,6 +73,7 @@ def measure(self, test_case: LLMTestCase) -> float: async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: + validate_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, diff --git a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py index 213c52a65..79d213642 100644 --- a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +++ b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py @@ -1,11 +1,13 @@ import 
asyncio from typing import Optional, List, Union from pydantic import BaseModel, Field -from threading import Lock -from concurrent.futures import ThreadPoolExecutor, as_completed -from deepeval.utils import trimAndLoadJson, get_or_create_event_loop -from deepeval.test_case import LLMTestCase +from deepeval.utils import ( + trimAndLoadJson, + get_or_create_event_loop, + validate_test_case_params, +) +from deepeval.test_case import LLMTestCase, LLMTestCaseParams from deepeval.metrics import BaseMetric from deepeval.models import GPTModel, DeepEvalBaseLLM from deepeval.metrics.contextual_relevancy.template import ( @@ -14,6 +16,12 @@ from deepeval.progress_context import metrics_progress_context from deepeval.telemetry import capture_metric_type +required_params: List[LLMTestCaseParams] = [ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.RETRIEVAL_CONTEXT, +] + class ContextualRelevancyVerdict(BaseModel): verdict: str @@ -40,14 +48,7 @@ def __init__( self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: - if ( - test_case.input is None - or test_case.actual_output is None - or test_case.retrieval_context is None - ): - raise ValueError( - "Input, actual output, or retrieval context cannot be None" - ) + validate_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, @@ -74,6 +75,7 @@ def measure(self, test_case: LLMTestCase) -> float: async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: + validate_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, diff --git a/deepeval/metrics/faithfulness/faithfulness.py b/deepeval/metrics/faithfulness/faithfulness.py index bfff13f81..cf70c5c34 100644 --- a/deepeval/metrics/faithfulness/faithfulness.py +++ b/deepeval/metrics/faithfulness/faithfulness.py @@ -2,14 +2,24 @@ from pydantic import BaseModel, Field import asyncio -from deepeval.test_case import LLMTestCase +from deepeval.test_case import LLMTestCase, LLMTestCaseParams from deepeval.metrics import BaseMetric -from deepeval.utils import trimAndLoadJson, get_or_create_event_loop +from deepeval.utils import ( + trimAndLoadJson, + get_or_create_event_loop, + validate_test_case_params, +) from deepeval.models import GPTModel, DeepEvalBaseLLM from deepeval.metrics.faithfulness.template import FaithfulnessTemplate from deepeval.progress_context import metrics_progress_context from deepeval.telemetry import capture_metric_type +required_params: List[LLMTestCaseParams] = [ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.RETRIEVAL_CONTEXT, +] + class FaithfulnessVerdict(BaseModel): verdict: str @@ -36,15 +46,7 @@ def __init__( self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: - if ( - test_case.input is None - or test_case.actual_output is None - or test_case.retrieval_context is None - ): - raise ValueError( - "Input, actual output, and retrieval context cannot be None" - ) - + validate_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, @@ -69,6 +71,7 @@ def measure(self, test_case: LLMTestCase) -> float: async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: + validate_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, 
self.evaluation_model, diff --git a/deepeval/metrics/hallucination/hallucination.py b/deepeval/metrics/hallucination/hallucination.py index 766d87eb0..09e65fd1f 100644 --- a/deepeval/metrics/hallucination/hallucination.py +++ b/deepeval/metrics/hallucination/hallucination.py @@ -1,17 +1,24 @@ -import asyncio from typing import Optional, Union, List -from threading import Lock -from concurrent.futures import ThreadPoolExecutor, as_completed from pydantic import BaseModel, Field -from deepeval.test_case import LLMTestCase +from deepeval.test_case import LLMTestCase, LLMTestCaseParams from deepeval.metrics import BaseMetric -from deepeval.utils import trimAndLoadJson, get_or_create_event_loop +from deepeval.utils import ( + trimAndLoadJson, + get_or_create_event_loop, + validate_test_case_params, +) from deepeval.metrics.hallucination.template import HallucinationTemplate from deepeval.models import GPTModel, DeepEvalBaseLLM from deepeval.progress_context import metrics_progress_context from deepeval.telemetry import capture_metric_type +required_params: List[LLMTestCaseParams] = [ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.CONTEXT, +] + class HallucinationVerdict(BaseModel): verdict: str @@ -38,13 +45,7 @@ def __init__( self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: - if ( - test_case.input is None - or test_case.actual_output is None - or test_case.context is None - ): - raise ValueError("Input, actual output, or context cannot be None") - + validate_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, @@ -71,6 +72,7 @@ def measure(self, test_case: LLMTestCase) -> float: async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: + validate_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, diff --git a/deepeval/metrics/summarization/summarization.py b/deepeval/metrics/summarization/summarization.py index 0f8690dd6..d24d2b7d6 100644 --- a/deepeval/metrics/summarization/summarization.py +++ b/deepeval/metrics/summarization/summarization.py @@ -2,17 +2,25 @@ from typing import List, Optional, Union from enum import Enum from pydantic import BaseModel, Field -from concurrent.futures import ThreadPoolExecutor -from deepeval.test_case import LLMTestCase +from deepeval.test_case import LLMTestCase, LLMTestCaseParams from deepeval.metrics import BaseMetric from deepeval.models import GPTModel, DeepEvalBaseLLM -from deepeval.utils import trimAndLoadJson, get_or_create_event_loop +from deepeval.utils import ( + trimAndLoadJson, + get_or_create_event_loop, + validate_test_case_params, +) from deepeval.metrics.summarization.template import SummarizationTemplate from deepeval.metrics.faithfulness.template import FaithfulnessTemplate from deepeval.progress_context import metrics_progress_context from deepeval.telemetry import capture_metric_type +required_params: List[LLMTestCaseParams] = [ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, +] + class SummarizationAlignmentVerdict(BaseModel): # yes, no, or idk @@ -60,9 +68,7 @@ def __init__( self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: - if test_case.input is None or test_case.actual_output is None: - raise ValueError("Input or actual output cannot be None") - + validate_test_case_params(test_case, required_params, self.__name__) with 
metrics_progress_context( self.__name__, self.evaluation_model, @@ -100,6 +106,7 @@ def measure(self, test_case: LLMTestCase) -> float: async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: + validate_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, diff --git a/deepeval/metrics/toxicity/toxicity.py b/deepeval/metrics/toxicity/toxicity.py index 282286031..863bf4c8b 100644 --- a/deepeval/metrics/toxicity/toxicity.py +++ b/deepeval/metrics/toxicity/toxicity.py @@ -2,14 +2,23 @@ from pydantic import BaseModel, Field from deepeval.metrics import BaseMetric -from deepeval.test_case import LLMTestCase +from deepeval.test_case import LLMTestCase, LLMTestCaseParams from deepeval.progress_context import metrics_progress_context from deepeval.telemetry import capture_metric_type from deepeval.models import GPTModel, DeepEvalBaseLLM -from deepeval.utils import trimAndLoadJson, get_or_create_event_loop +from deepeval.utils import ( + trimAndLoadJson, + get_or_create_event_loop, + validate_test_case_params, +) from deepeval.metrics.bias.template import BiasTemplate from deepeval.metrics.toxicity.template import ToxicityTemplate +required_params: List[LLMTestCaseParams] = [ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, +] + # ToxicMetric uses similar rubric to decoding trust: https://arxiv.org/abs/2306.11698 class ToxicityVerdict(BaseModel): @@ -37,9 +46,7 @@ def __init__( self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: - if test_case.input is None or test_case.actual_output is None: - raise ValueError("Input or actual output cannot be None") - + validate_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, @@ -66,6 +73,7 @@ def measure(self, test_case: LLMTestCase) -> float: async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: + validate_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, diff --git a/deepeval/utils.py b/deepeval/utils.py index 532192f19..92e4e3793 100644 --- a/deepeval/utils.py +++ b/deepeval/utils.py @@ -3,7 +3,7 @@ import os import json import time -from typing import Any, Optional, Dict +from typing import Any, Optional, Dict, List from collections.abc import Iterable import tqdm import re @@ -14,6 +14,32 @@ import asyncio from deepeval.key_handler import KeyValues, KEY_FILE_HANDLER +from deepeval.test_case import LLMTestCase, LLMTestCaseParams + + +def validate_test_case_params( + test_case: LLMTestCase, + test_case_params: List[LLMTestCaseParams], + metric_name: str, +): + missing_params = [] + for param in test_case_params: + if getattr(test_case, param.value) is None: + missing_params.append(f"'{param.value}'") + + if missing_params: + if len(missing_params) == 1: + missing_params_str = missing_params[0] + elif len(missing_params) == 2: + missing_params_str = " and ".join(missing_params) + else: + missing_params_str = ( + ", ".join(missing_params[:-1]) + ", and " + missing_params[-1] + ) + + raise ValueError( + f"{missing_params_str} cannot be None for the '{metric_name}' metric" + ) def get_or_create_event_loop() -> asyncio.AbstractEventLoop: diff --git a/tests/test_contextual_precision.py b/tests/test_contextual_precision.py index 0916f167a..eb849fdbe 100644 --- a/tests/test_contextual_precision.py +++ 
b/tests/test_contextual_precision.py @@ -59,13 +59,13 @@ """ -@pytest.mark.skip(reason="openai is expensive") +# @pytest.mark.skip(reason="openai is expensive") def test_contextual_precision(): metric = ContextualPrecisionMetric(threshold=0.5) test_case = LLMTestCase( input=question, actual_output=answer, - expected_output=answer, - retrieval_context=[one, four, two, five, three], + # expected_output=answer, + # retrieval_context=[one, four, two, five, three], ) assert_test(test_case, [metric]) From 3541423fd9cfbe828d907cb4abf60ec979fb7e2c Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Fri, 8 Mar 2024 19:26:01 +0800 Subject: [PATCH 37/59] rename --- deepeval/metrics/answer_relevancy/answer_relevancy.py | 6 +++--- deepeval/metrics/bias/bias.py | 6 +++--- .../metrics/contextual_precision/contextual_precision.py | 6 +++--- deepeval/metrics/contextual_recall/contextual_recall.py | 6 +++--- .../metrics/contextual_relevancy/contextual_relevancy.py | 6 +++--- deepeval/metrics/faithfulness/faithfulness.py | 6 +++--- deepeval/metrics/hallucination/hallucination.py | 6 +++--- deepeval/metrics/summarization/summarization.py | 6 +++--- deepeval/metrics/toxicity/toxicity.py | 6 +++--- deepeval/utils.py | 2 +- 10 files changed, 28 insertions(+), 28 deletions(-) diff --git a/deepeval/metrics/answer_relevancy/answer_relevancy.py b/deepeval/metrics/answer_relevancy/answer_relevancy.py index aa0bf3a89..f733263d9 100644 --- a/deepeval/metrics/answer_relevancy/answer_relevancy.py +++ b/deepeval/metrics/answer_relevancy/answer_relevancy.py @@ -5,7 +5,7 @@ from deepeval.utils import ( trimAndLoadJson, get_or_create_event_loop, - validate_test_case_params, + check_test_case_params, ) from deepeval.test_case import LLMTestCase, LLMTestCaseParams from deepeval.metrics import BaseMetric @@ -45,7 +45,7 @@ def __init__( self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: - validate_test_case_params(test_case, required_params, self.__name__) + check_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, @@ -73,7 +73,7 @@ def measure(self, test_case: LLMTestCase) -> float: async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: - validate_test_case_params(test_case, required_params, self.__name__) + check_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, diff --git a/deepeval/metrics/bias/bias.py b/deepeval/metrics/bias/bias.py index 720163233..43fbe9885 100644 --- a/deepeval/metrics/bias/bias.py +++ b/deepeval/metrics/bias/bias.py @@ -9,7 +9,7 @@ from deepeval.utils import ( trimAndLoadJson, get_or_create_event_loop, - validate_test_case_params, + check_test_case_params, ) from deepeval.metrics.bias.template import BiasTemplate @@ -46,7 +46,7 @@ def __init__( self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: - validate_test_case_params(test_case, required_params, self.__name__) + check_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, @@ -72,7 +72,7 @@ def measure(self, test_case: LLMTestCase) -> float: async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: - validate_test_case_params(test_case, required_params, self.__name__) + check_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, 
self.evaluation_model, diff --git a/deepeval/metrics/contextual_precision/contextual_precision.py b/deepeval/metrics/contextual_precision/contextual_precision.py index c912d2912..6831adf9f 100644 --- a/deepeval/metrics/contextual_precision/contextual_precision.py +++ b/deepeval/metrics/contextual_precision/contextual_precision.py @@ -4,7 +4,7 @@ from deepeval.utils import ( trimAndLoadJson, get_or_create_event_loop, - validate_test_case_params, + check_test_case_params, ) from deepeval.test_case import LLMTestCase, LLMTestCaseParams from deepeval.metrics import BaseMetric @@ -49,7 +49,7 @@ def __init__( self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: - validate_test_case_params(test_case, required_params, self.__name__) + check_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, @@ -79,7 +79,7 @@ def measure(self, test_case: LLMTestCase) -> float: async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: - validate_test_case_params(test_case, required_params, self.__name__) + check_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, diff --git a/deepeval/metrics/contextual_recall/contextual_recall.py b/deepeval/metrics/contextual_recall/contextual_recall.py index 1992c11b9..4e2b49d89 100644 --- a/deepeval/metrics/contextual_recall/contextual_recall.py +++ b/deepeval/metrics/contextual_recall/contextual_recall.py @@ -4,7 +4,7 @@ from deepeval.utils import ( trimAndLoadJson, get_or_create_event_loop, - validate_test_case_params, + check_test_case_params, ) from deepeval.test_case import LLMTestCase, LLMTestCaseParams from deepeval.metrics import BaseMetric @@ -46,7 +46,7 @@ def __init__( self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: - validate_test_case_params(test_case, required_params, self.__name__) + check_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, @@ -73,7 +73,7 @@ def measure(self, test_case: LLMTestCase) -> float: async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: - validate_test_case_params(test_case, required_params, self.__name__) + check_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, diff --git a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py index 79d213642..d5f18207d 100644 --- a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +++ b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py @@ -5,7 +5,7 @@ from deepeval.utils import ( trimAndLoadJson, get_or_create_event_loop, - validate_test_case_params, + check_test_case_params, ) from deepeval.test_case import LLMTestCase, LLMTestCaseParams from deepeval.metrics import BaseMetric @@ -48,7 +48,7 @@ def __init__( self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: - validate_test_case_params(test_case, required_params, self.__name__) + check_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, @@ -75,7 +75,7 @@ def measure(self, test_case: LLMTestCase) -> float: async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: - 
validate_test_case_params(test_case, required_params, self.__name__) + check_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, diff --git a/deepeval/metrics/faithfulness/faithfulness.py b/deepeval/metrics/faithfulness/faithfulness.py index cf70c5c34..18631c5ee 100644 --- a/deepeval/metrics/faithfulness/faithfulness.py +++ b/deepeval/metrics/faithfulness/faithfulness.py @@ -7,7 +7,7 @@ from deepeval.utils import ( trimAndLoadJson, get_or_create_event_loop, - validate_test_case_params, + check_test_case_params, ) from deepeval.models import GPTModel, DeepEvalBaseLLM from deepeval.metrics.faithfulness.template import FaithfulnessTemplate @@ -46,7 +46,7 @@ def __init__( self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: - validate_test_case_params(test_case, required_params, self.__name__) + check_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, @@ -71,7 +71,7 @@ def measure(self, test_case: LLMTestCase) -> float: async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: - validate_test_case_params(test_case, required_params, self.__name__) + check_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, diff --git a/deepeval/metrics/hallucination/hallucination.py b/deepeval/metrics/hallucination/hallucination.py index 09e65fd1f..8be25c449 100644 --- a/deepeval/metrics/hallucination/hallucination.py +++ b/deepeval/metrics/hallucination/hallucination.py @@ -6,7 +6,7 @@ from deepeval.utils import ( trimAndLoadJson, get_or_create_event_loop, - validate_test_case_params, + check_test_case_params, ) from deepeval.metrics.hallucination.template import HallucinationTemplate from deepeval.models import GPTModel, DeepEvalBaseLLM @@ -45,7 +45,7 @@ def __init__( self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: - validate_test_case_params(test_case, required_params, self.__name__) + check_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, @@ -72,7 +72,7 @@ def measure(self, test_case: LLMTestCase) -> float: async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: - validate_test_case_params(test_case, required_params, self.__name__) + check_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, diff --git a/deepeval/metrics/summarization/summarization.py b/deepeval/metrics/summarization/summarization.py index d24d2b7d6..762a33ebe 100644 --- a/deepeval/metrics/summarization/summarization.py +++ b/deepeval/metrics/summarization/summarization.py @@ -9,7 +9,7 @@ from deepeval.utils import ( trimAndLoadJson, get_or_create_event_loop, - validate_test_case_params, + check_test_case_params, ) from deepeval.metrics.summarization.template import SummarizationTemplate from deepeval.metrics.faithfulness.template import FaithfulnessTemplate @@ -68,7 +68,7 @@ def __init__( self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: - validate_test_case_params(test_case, required_params, self.__name__) + check_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, @@ -106,7 +106,7 @@ def measure(self, test_case: 
LLMTestCase) -> float: async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: - validate_test_case_params(test_case, required_params, self.__name__) + check_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, diff --git a/deepeval/metrics/toxicity/toxicity.py b/deepeval/metrics/toxicity/toxicity.py index 863bf4c8b..80aab9f94 100644 --- a/deepeval/metrics/toxicity/toxicity.py +++ b/deepeval/metrics/toxicity/toxicity.py @@ -9,7 +9,7 @@ from deepeval.utils import ( trimAndLoadJson, get_or_create_event_loop, - validate_test_case_params, + check_test_case_params, ) from deepeval.metrics.bias.template import BiasTemplate from deepeval.metrics.toxicity.template import ToxicityTemplate @@ -46,7 +46,7 @@ def __init__( self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: - validate_test_case_params(test_case, required_params, self.__name__) + check_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, @@ -73,7 +73,7 @@ def measure(self, test_case: LLMTestCase) -> float: async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: - validate_test_case_params(test_case, required_params, self.__name__) + check_test_case_params(test_case, required_params, self.__name__) with metrics_progress_context( self.__name__, self.evaluation_model, diff --git a/deepeval/utils.py b/deepeval/utils.py index 92e4e3793..4a6562bc7 100644 --- a/deepeval/utils.py +++ b/deepeval/utils.py @@ -17,7 +17,7 @@ from deepeval.test_case import LLMTestCase, LLMTestCaseParams -def validate_test_case_params( +def check_test_case_params( test_case: LLMTestCase, test_case_params: List[LLMTestCaseParams], metric_name: str, From e0cd2fa5a1b165f9540aa5b968d84a0384befa25 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Fri, 8 Mar 2024 19:33:02 +0800 Subject: [PATCH 38/59] updated tests --- tests/test_everything.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_everything.py b/tests/test_everything.py index c89cac6c8..909af211e 100644 --- a/tests/test_everything.py +++ b/tests/test_everything.py @@ -69,12 +69,12 @@ being composed mostly of rock and metal. """ -strict_mode = False +strict_mode = True @pytest.mark.skip(reason="openai is expensive") def test_everything(): - metric1 = AnswerRelevancyMetric(threshold=0.5, strict_mode=strict_mode) + metric1 = AnswerRelevancyMetric(threshold=0.5, strict_mode=strict_mode, asynchronous=True) metric2 = FaithfulnessMetric(threshold=0.5, strict_mode=strict_mode) metric3 = ContextualPrecisionMetric(threshold=0.5, strict_mode=strict_mode) metric4 = ContextualRecallMetric(threshold=0.5, strict_mode=strict_mode) @@ -114,7 +114,7 @@ def test_everything(): metric9, # metric10, ], - # asynchronous=False, + asynchronous=False, ) From d68063b26c7fe3ef8916b843580087b13e0e235e Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Fri, 8 Mar 2024 19:38:07 +0800 Subject: [PATCH 39/59] . 
--- tests/test_contextual_precision.py | 6 +++--- tests/test_everything.py | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/test_contextual_precision.py b/tests/test_contextual_precision.py index eb849fdbe..0916f167a 100644 --- a/tests/test_contextual_precision.py +++ b/tests/test_contextual_precision.py @@ -59,13 +59,13 @@ """ -# @pytest.mark.skip(reason="openai is expensive") +@pytest.mark.skip(reason="openai is expensive") def test_contextual_precision(): metric = ContextualPrecisionMetric(threshold=0.5) test_case = LLMTestCase( input=question, actual_output=answer, - # expected_output=answer, - # retrieval_context=[one, four, two, five, three], + expected_output=answer, + retrieval_context=[one, four, two, five, three], ) assert_test(test_case, [metric]) diff --git a/tests/test_everything.py b/tests/test_everything.py index 909af211e..df7323307 100644 --- a/tests/test_everything.py +++ b/tests/test_everything.py @@ -74,7 +74,9 @@ @pytest.mark.skip(reason="openai is expensive") def test_everything(): - metric1 = AnswerRelevancyMetric(threshold=0.5, strict_mode=strict_mode, asynchronous=True) + metric1 = AnswerRelevancyMetric( + threshold=0.5, strict_mode=strict_mode, asynchronous=True + ) metric2 = FaithfulnessMetric(threshold=0.5, strict_mode=strict_mode) metric3 = ContextualPrecisionMetric(threshold=0.5, strict_mode=strict_mode) metric4 = ContextualRecallMetric(threshold=0.5, strict_mode=strict_mode) From ea5898c69a86326481d6ea431ba614ad919cc876 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Fri, 8 Mar 2024 21:05:43 +0800 Subject: [PATCH 40/59] Added async for geval --- deepeval/metrics/g_eval/g_eval.py | 121 +++++++++++++++++++++--------- 1 file changed, 84 insertions(+), 37 deletions(-) diff --git a/deepeval/metrics/g_eval/g_eval.py b/deepeval/metrics/g_eval/g_eval.py index fc89981c9..8e665a3f9 100644 --- a/deepeval/metrics/g_eval/g_eval.py +++ b/deepeval/metrics/g_eval/g_eval.py @@ -8,9 +8,10 @@ evaluation_steps_template, evaluation_results_template, ) -from deepeval.utils import trimAndLoadJson +from deepeval.utils import trimAndLoadJson, check_test_case_params, get_or_create_event_loop from deepeval.models import GPTModel, DeepEvalBaseLLM from deepeval.telemetry import capture_metric_type +from deepeval.progress_context import metrics_progress_context class GEvalResponse(BaseModel): @@ -27,6 +28,7 @@ def __init__( evaluation_steps: Optional[List[str]] = None, model: Optional[Union[str, DeepEvalBaseLLM]] = None, threshold: float = 0.5, + asynchronous: bool = True, strict_mode: bool = False, ): self.name = name @@ -45,7 +47,7 @@ def __init__( # Check if evaluation_steps is provided, it cannot be an empty list if evaluation_steps is not None and len(evaluation_steps) == 0: raise ValueError( - "Evaluation steps must not be an empty list. Either omit evaluation steps or include a non-empty list of steps." + "'evaluation_steps' must not be an empty list. Either omit evaluation steps or include a non-empty list of steps." 
) self.criteria = criteria @@ -57,69 +59,114 @@ def __init__( self.evaluation_steps = evaluation_steps self.threshold = 1 if strict_mode else threshold self.strict_mode = strict_mode + self.asynchronous = asynchronous def measure(self, test_case: LLMTestCase): """LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf""" - - # Measure the test case - for param in self.evaluation_params: - if ( - not hasattr(test_case, param.value) - or getattr(test_case, param.value) is None - ): - raise ValueError( - f"Test case is missing the required attribute: {param.value}" + check_test_case_params( + test_case, self.evaluation_params, f"GEval ({self.__name__})" + ) + with metrics_progress_context( + f"GEval ({self.__name__})", + self.evaluation_model, + self.strict_mode, + self.asynchronous, + ): + if self.asynchronous: + loop = get_or_create_event_loop() + loop.run_until_complete(self.a_measure(test_case, _show_indicator=False)) + else: + self.evaluation_steps: List[str] = ( + self._generate_evaluation_steps() ) + g_score, reason = self.evaluate(test_case) + self.reason = reason + self.score = float(g_score) / 10 + self.score = ( + 0 + if self.strict_mode and self.score < self.threshold + else self.score + ) + self.success = self.score >= self.threshold + capture_metric_type(self.__name__) + return self.score - if self.evaluation_steps is None: - data = trimAndLoadJson(self.generate_evaluation_steps()) - self.evaluation_steps = data["steps"] - - score, reason = self.evaluate(test_case) - self.reason = reason - - score = float(score) / 10 - - self.score = 0 if self.strict_mode and score < self.threshold else score - self.success = score >= self.threshold - capture_metric_type(self.__name__) - return self.score - - def is_successful(self) -> bool: - self.success = self.score >= self.threshold - return self.success + async def a_measure(self, test_case: LLMTestCase, _show_indicator: bool = True): + check_test_case_params( + test_case, self.evaluation_params, f"GEval ({self.__name__})" + ) + with metrics_progress_context( + f"GEval ({self.__name__})", + self.evaluation_model, + self.strict_mode, + True, + _show_indicator, + ): + self.evaluation_steps: List[str] = ( + self._a_generate_evaluation_steps() + ) + g_score, reason = self._a_evaluate(test_case) + self.reason = reason + self.score = float(g_score) / 10 + self.score = ( + 0 + if self.strict_mode and self.score < self.threshold + else self.score + ) + self.success = self.score >= self.threshold + capture_metric_type(self.__name__) + return self.score - def generate_evaluation_steps(self): + def _a_generate_evaluation_steps(self) -> List[str]: prompt: dict = evaluation_steps_template.format(criteria=self.criteria) + res = self.model(prompt) + data = trimAndLoadJson(res) + return data["steps"] + def _generate_evaluation_steps(self) -> List[str]: + prompt: dict = evaluation_steps_template.format(criteria=self.criteria) res = self.model(prompt) + data = trimAndLoadJson(res) + return data["steps"] - return res + async def _a_evaluate(self, test_case: LLMTestCase) -> Tuple[int, str]: + text = """""" + for param in self.evaluation_params: + value = getattr(test_case, param.value) + text += f"{param.value}: {value} \n\n" + + prompt: dict = evaluation_results_template.format( + evaluation_steps=self.number_evaluation_steps(), + text=text, + ) + res = await self.model.a_generate(prompt) + data = trimAndLoadJson(res) + return data["score"], data["reason"] def evaluate(self, test_case: LLMTestCase) -> Tuple[int, str]: text = 
"""""" - for param in self.evaluation_params: value = getattr(test_case, param.value) text += f"{param.value}: {value} \n\n" prompt: dict = evaluation_results_template.format( - evaluation_steps=self.numbered_evaluation_steps(), + evaluation_steps=self.number_evaluation_steps(), text=text, ) - - res = self.model(prompt) + res = self.model.generate(prompt) data = trimAndLoadJson(res) - return data["score"], data["reason"] - def numbered_evaluation_steps(self): + def number_evaluation_steps(self): evaluation_steps = """""" for index, string in enumerate(self.evaluation_steps, start=1): evaluation_steps += f"{index}. {string}\n" - return evaluation_steps + def is_successful(self) -> bool: + self.success = self.score >= self.threshold + return self.success + @property def __name__(self): return self.name From 0a4a88230d3a5b45965e5a9b3e74a447547b6bfd Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Fri, 8 Mar 2024 21:42:32 +0800 Subject: [PATCH 41/59] improved G Eval --- deepeval/metrics/g_eval/g_eval.py | 81 +++++++++++++++++++++-------- deepeval/metrics/g_eval/template.py | 17 +++--- tests/test_everything.py | 24 ++++----- tests/test_g_eval.py | 3 +- 4 files changed, 82 insertions(+), 43 deletions(-) diff --git a/deepeval/metrics/g_eval/g_eval.py b/deepeval/metrics/g_eval/g_eval.py index 8e665a3f9..100f341af 100644 --- a/deepeval/metrics/g_eval/g_eval.py +++ b/deepeval/metrics/g_eval/g_eval.py @@ -1,18 +1,43 @@ -import json from typing import Optional, List, Tuple, Union from pydantic import BaseModel from deepeval.metrics import BaseMetric from deepeval.test_case import LLMTestCase, LLMTestCaseParams -from deepeval.metrics.g_eval.template import ( - evaluation_steps_template, - evaluation_results_template, +from deepeval.metrics.g_eval.template import GEvalTemplate +from deepeval.utils import ( + trimAndLoadJson, + check_test_case_params, + get_or_create_event_loop, ) -from deepeval.utils import trimAndLoadJson, check_test_case_params, get_or_create_event_loop from deepeval.models import GPTModel, DeepEvalBaseLLM from deepeval.telemetry import capture_metric_type from deepeval.progress_context import metrics_progress_context +G_EVAL_PARAMS = { + LLMTestCaseParams.INPUT: "Input", + LLMTestCaseParams.ACTUAL_OUTPUT: "Actual Output", + LLMTestCaseParams.EXPECTED_OUTPUT: "Expected Output", + LLMTestCaseParams.CONTEXT: "Context", + LLMTestCaseParams.RETRIEVAL_CONTEXT: "Retrieval Context", +} + + +def construct_g_eval_params_string( + llm_test_case_params: List[LLMTestCaseParams], +): + g_eval_params = [G_EVAL_PARAMS[param] for param in llm_test_case_params] + + if len(g_eval_params) == 1: + g_eval_params_str = g_eval_params[0] + elif len(g_eval_params) == 2: + g_eval_params_str = " and ".join(g_eval_params) + else: + g_eval_params_str = ( + ", ".join(g_eval_params[:-1]) + ", and " + g_eval_params[-1] + ) + + return g_eval_params_str + class GEvalResponse(BaseModel): score: float @@ -61,10 +86,10 @@ def __init__( self.strict_mode = strict_mode self.asynchronous = asynchronous - def measure(self, test_case: LLMTestCase): + def measure(self, test_case: LLMTestCase) -> float: """LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf""" check_test_case_params( - test_case, self.evaluation_params, f"GEval ({self.__name__})" + test_case, self.evaluation_params, f"GEval({self.__name__})" ) with metrics_progress_context( f"GEval ({self.__name__})", @@ -74,7 +99,9 @@ def measure(self, test_case: LLMTestCase): ): if self.asynchronous: loop = get_or_create_event_loop() - 
loop.run_until_complete(self.a_measure(test_case, _show_indicator=False)) + loop.run_until_complete( + self.a_measure(test_case, _show_indicator=False) + ) else: self.evaluation_steps: List[str] = ( self._generate_evaluation_steps() @@ -91,21 +118,23 @@ def measure(self, test_case: LLMTestCase): capture_metric_type(self.__name__) return self.score - async def a_measure(self, test_case: LLMTestCase, _show_indicator: bool = True): + async def a_measure( + self, test_case: LLMTestCase, _show_indicator: bool = True + ) -> float: check_test_case_params( - test_case, self.evaluation_params, f"GEval ({self.__name__})" + test_case, self.evaluation_params, f"GEval({self.__name__})" ) with metrics_progress_context( - f"GEval ({self.__name__})", + f"GEval({self.__name__})", self.evaluation_model, self.strict_mode, True, _show_indicator, ): self.evaluation_steps: List[str] = ( - self._a_generate_evaluation_steps() + await self._a_generate_evaluation_steps() ) - g_score, reason = self._a_evaluate(test_case) + g_score, reason = await self._a_evaluate(test_case) self.reason = reason self.score = float(g_score) / 10 self.score = ( @@ -117,15 +146,25 @@ async def a_measure(self, test_case: LLMTestCase, _show_indicator: bool = True): capture_metric_type(self.__name__) return self.score - def _a_generate_evaluation_steps(self) -> List[str]: - prompt: dict = evaluation_steps_template.format(criteria=self.criteria) - res = self.model(prompt) + async def _a_generate_evaluation_steps(self) -> List[str]: + g_eval_params_str = construct_g_eval_params_string( + self.evaluation_params + ) + prompt = GEvalTemplate.generate_evaluation_steps( + criteria=self.criteria, parameters=g_eval_params_str + ) + res = await self.model.a_generate(prompt) data = trimAndLoadJson(res) return data["steps"] def _generate_evaluation_steps(self) -> List[str]: - prompt: dict = evaluation_steps_template.format(criteria=self.criteria) - res = self.model(prompt) + g_eval_params_str = construct_g_eval_params_string( + self.evaluation_params + ) + prompt = GEvalTemplate.generate_evaluation_steps( + criteria=self.criteria, parameters=g_eval_params_str + ) + res = self.model.generate(prompt) data = trimAndLoadJson(res) return data["steps"] @@ -133,9 +172,9 @@ async def _a_evaluate(self, test_case: LLMTestCase) -> Tuple[int, str]: text = """""" for param in self.evaluation_params: value = getattr(test_case, param.value) - text += f"{param.value}: {value} \n\n" + text += f"{G_EVAL_PARAMS[param]}:\n{value} \n\n" - prompt: dict = evaluation_results_template.format( + prompt = GEvalTemplate.generate_evaluation_results( evaluation_steps=self.number_evaluation_steps(), text=text, ) @@ -149,7 +188,7 @@ def evaluate(self, test_case: LLMTestCase) -> Tuple[int, str]: value = getattr(test_case, param.value) text += f"{param.value}: {value} \n\n" - prompt: dict = evaluation_results_template.format( + prompt = GEvalTemplate.generate_evaluation_results( evaluation_steps=self.number_evaluation_steps(), text=text, ) diff --git a/deepeval/metrics/g_eval/template.py b/deepeval/metrics/g_eval/template.py index cc729b709..80e328fca 100644 --- a/deepeval/metrics/g_eval/template.py +++ b/deepeval/metrics/g_eval/template.py @@ -1,8 +1,9 @@ -# TODO: LLMEvalTemplate -evaluation_steps_template = """ -You will be given 4 blocks of text labelled "Input", "Actual output", "Expected output", and "Context". Generate 3-4 concise evaluation steps based on the criteria below. Explicitly state to ignore any blocks of text that is not mentioned in the evaluation criteria. 
+class GEvalTemplate: + @staticmethod + def generate_evaluation_steps(parameters, criteria): + return f"""Given an evaluation criteria which outlines how you should judge the {parameters}, generate 3-4 concise evaluation steps based on the criteria below. You MUST make it clear how to evaluate {parameters} in relation to one another. -Criteria: +Evaluation Criteria: {criteria} ** @@ -12,15 +13,15 @@ JSON: """ -evaluation_results_template = """ + @staticmethod + def generate_evaluation_results(evaluation_steps, text): + return f"""Given the evaluation steps, return a JSON with two keys: 1) a `score` key ranging from 0 - 10, with 10 being that it follows the criteria and 0 being that it does not, and 2) a `reason` key, a reason for the given score. + Evaluation Steps: {evaluation_steps} -Text: {text} -Given the evaluation steps, please evaluate the provided Text. Some fields in text might be unavailable and will be labelled "N/A". Only return a JSON with two keys: 1) a `score` key ranging from 0 - 10, with 10 being that it follows the criteria and 0 being that it does not, and 2) a `reason` key, a reason for the given score. Be extra harsh and give as low a score as possible as it designed to penalize. - ** IMPORTANT: Please make sure to only return in JSON format, with the "score" and "reason" key. No words or explaination is needed. ** diff --git a/tests/test_everything.py b/tests/test_everything.py index df7323307..433919979 100644 --- a/tests/test_everything.py +++ b/tests/test_everything.py @@ -69,7 +69,7 @@ being composed mostly of rock and metal. """ -strict_mode = True +strict_mode = False @pytest.mark.skip(reason="openai is expensive") 
deepeval/metrics/faithfulness/faithfulness.py | 8 ++++---- deepeval/metrics/hallucination/hallucination.py | 8 ++++---- .../metrics/knowledge_retention/knowledge_retention.py | 6 +++--- deepeval/metrics/summarization/summarization.py | 8 ++++---- deepeval/metrics/toxicity/toxicity.py | 8 ++++---- deepeval/progress_context.py | 4 ++-- tests/test_custom_metric.py | 2 +- tests/test_deployment.py | 2 +- tests/test_everything.py | 2 +- tests/test_faithfulness.py | 2 +- tests/test_g_eval.py | 2 +- tests/test_summarization.py | 2 +- 19 files changed, 54 insertions(+), 54 deletions(-) diff --git a/deepeval/evaluate.py b/deepeval/evaluate.py index 739e83498..ab47133af 100644 --- a/deepeval/evaluate.py +++ b/deepeval/evaluate.py @@ -82,7 +82,7 @@ def execute_test_cases( for metric in metrics: # Override metric async - metric.asynchronous = False + metric.run_async = False metric.measure(test_case) metric_metadata = MetricsMetadata( @@ -188,7 +188,7 @@ def run_test( def assert_test( - test_case: LLMTestCase, metrics: List[BaseMetric], asynchronous: bool = True + test_case: LLMTestCase, metrics: List[BaseMetric], run_async: bool = True ): # TODO: refactor for metric in metrics: @@ -199,7 +199,7 @@ def assert_test( if not isinstance(test_case, LLMTestCase): raise TypeError("'test_case' must be an instance of 'LLMTestCase'.") - if asynchronous: + if run_async: loop = get_or_create_event_loop() test_result = loop.run_until_complete( a_execute_test_cases( @@ -228,7 +228,7 @@ def assert_test( def evaluate( test_cases: List[LLMTestCase], metrics: List[BaseMetric], - asynchronous: bool = True, + run_async: bool = True, ): # TODO: refactor for metric in metrics: @@ -244,7 +244,7 @@ def evaluate( test_run_manager.reset() with progress_context("Evaluating testcases..."): - if asynchronous: + if run_async: loop = get_or_create_event_loop() test_results = loop.run_until_complete( a_execute_test_cases(test_cases, metrics, True) diff --git a/deepeval/metrics/answer_relevancy/answer_relevancy.py b/deepeval/metrics/answer_relevancy/answer_relevancy.py index f733263d9..a17d19a3f 100644 --- a/deepeval/metrics/answer_relevancy/answer_relevancy.py +++ b/deepeval/metrics/answer_relevancy/answer_relevancy.py @@ -31,7 +31,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - asynchronous: bool = True, + run_async: bool = True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -41,7 +41,7 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.asynchronous = asynchronous + self.run_async = run_async self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: @@ -50,9 +50,9 @@ def measure(self, test_case: LLMTestCase) -> float: self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + self.run_async, ): - if self.asynchronous: + if self.run_async: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) diff --git a/deepeval/metrics/base_metric.py b/deepeval/metrics/base_metric.py index 981e66083..7cd6d3212 100644 --- a/deepeval/metrics/base_metric.py +++ b/deepeval/metrics/base_metric.py @@ -10,7 +10,7 @@ class BaseMetric: reason: Optional[str] = None evaluation_model: Optional[str] = None strict_mode: bool = False - asynchronous: Optional[bool] = None + run_async: Optional[bool] = None @property def threshold(self) -> float: @@ -27,7 
+27,7 @@ def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: @abstractmethod async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: raise NotImplementedError( - f"Async execution for {self.__class__.__name__} not supported yet. Please turn set 'asynchronous' to 'False'." + f"Async execution for {self.__class__.__name__} not supported yet. Please turn set 'run_async' to 'False'." ) @abstractmethod diff --git a/deepeval/metrics/bias/bias.py b/deepeval/metrics/bias/bias.py index 43fbe9885..f8d77388a 100644 --- a/deepeval/metrics/bias/bias.py +++ b/deepeval/metrics/bias/bias.py @@ -32,7 +32,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - asynchronous: bool = True, + run_async: bool = True, strict_mode: bool = False, ): self.threshold = 0 if strict_mode else threshold @@ -42,7 +42,7 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.asynchronous = asynchronous + self.run_async = run_async self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: @@ -51,9 +51,9 @@ def measure(self, test_case: LLMTestCase) -> float: self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + self.run_async, ): - if self.asynchronous: + if self.run_async: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) diff --git a/deepeval/metrics/contextual_precision/contextual_precision.py b/deepeval/metrics/contextual_precision/contextual_precision.py index 6831adf9f..172b6bc86 100644 --- a/deepeval/metrics/contextual_precision/contextual_precision.py +++ b/deepeval/metrics/contextual_precision/contextual_precision.py @@ -35,7 +35,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - asynchronous: bool = True, + run_async: bool = True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -45,7 +45,7 @@ def __init__( else: self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() - self.asynchronous = asynchronous + self.run_async = run_async self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: @@ -54,10 +54,10 @@ def measure(self, test_case: LLMTestCase) -> float: self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + self.run_async, ): - if self.asynchronous: + if self.run_async: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) diff --git a/deepeval/metrics/contextual_recall/contextual_recall.py b/deepeval/metrics/contextual_recall/contextual_recall.py index 4e2b49d89..dd674aa12 100644 --- a/deepeval/metrics/contextual_recall/contextual_recall.py +++ b/deepeval/metrics/contextual_recall/contextual_recall.py @@ -32,7 +32,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - asynchronous: bool = True, + run_async: bool = True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -42,7 +42,7 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.asynchronous = asynchronous + self.run_async = run_async self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: @@ 
-51,9 +51,9 @@ def measure(self, test_case: LLMTestCase) -> float: self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + self.run_async, ): - if self.asynchronous: + if self.run_async: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) diff --git a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py index d5f18207d..fe9b19610 100644 --- a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +++ b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py @@ -34,7 +34,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - asynchronous: bool = True, + run_async: bool = True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -44,7 +44,7 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.asynchronous = asynchronous + self.run_async = run_async self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: @@ -53,9 +53,9 @@ def measure(self, test_case: LLMTestCase) -> float: self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + self.run_async, ): - if self.asynchronous: + if self.run_async: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) diff --git a/deepeval/metrics/faithfulness/faithfulness.py b/deepeval/metrics/faithfulness/faithfulness.py index 18631c5ee..f1c094bba 100644 --- a/deepeval/metrics/faithfulness/faithfulness.py +++ b/deepeval/metrics/faithfulness/faithfulness.py @@ -32,7 +32,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - asynchronous: bool = True, + run_async: bool = True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -42,7 +42,7 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.asynchronous = asynchronous + self.run_async = run_async self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: @@ -51,9 +51,9 @@ def measure(self, test_case: LLMTestCase) -> float: self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + self.run_async, ): - if self.asynchronous: + if self.run_async: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) diff --git a/deepeval/metrics/hallucination/hallucination.py b/deepeval/metrics/hallucination/hallucination.py index 8be25c449..931984f1d 100644 --- a/deepeval/metrics/hallucination/hallucination.py +++ b/deepeval/metrics/hallucination/hallucination.py @@ -31,7 +31,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - asynchronous: bool = False, + run_async: bool = False, strict_mode: bool = False, ): self.threshold = 0 if strict_mode else threshold @@ -41,7 +41,7 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.asynchronous = asynchronous + self.run_async = run_async self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: @@ -50,9 +50,9 @@ def measure(self, test_case: LLMTestCase) 
-> float: self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + self.run_async, ): - if self.asynchronous: + if self.run_async: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) diff --git a/deepeval/metrics/knowledge_retention/knowledge_retention.py b/deepeval/metrics/knowledge_retention/knowledge_retention.py index f6428cf49..4ef8753a3 100644 --- a/deepeval/metrics/knowledge_retention/knowledge_retention.py +++ b/deepeval/metrics/knowledge_retention/knowledge_retention.py @@ -77,7 +77,7 @@ def _generate_reason(self, score: float) -> str: score=format(score, ".2f"), ) - res = self.model(prompt) + res = self.model.generate(prompt) return res def _calculate_score(self) -> float: @@ -105,7 +105,7 @@ def _generate_verdicts( llm_message=message.actual_output, previous_knowledge=previous_knowledge, ) - res = self.model(prompt) + res = self.model.generate(prompt) data = trimAndLoadJson(res) verdict = KnowledgeRetentionVerdict(index=index, **data) verdicts.append(verdict) @@ -128,7 +128,7 @@ def _generate_knowledges( previous_knowledge=previous_knowledge, ) - res = self.model(prompt) + res = self.model.generate(prompt) data = trimAndLoadJson(res) knowledge = Knowledge(data=data) knowledges.append(knowledge) diff --git a/deepeval/metrics/summarization/summarization.py b/deepeval/metrics/summarization/summarization.py index 762a33ebe..e11dc70b2 100644 --- a/deepeval/metrics/summarization/summarization.py +++ b/deepeval/metrics/summarization/summarization.py @@ -47,7 +47,7 @@ def __init__( model: Optional[Union[str, DeepEvalBaseLLM]] = None, assessment_questions: Optional[List[str]] = None, include_reason: bool = True, - asynchronous=True, + run_async=True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -62,7 +62,7 @@ def __init__( else: self.assessment_questions = assessment_questions - self.asynchronous = asynchronous + self.run_async = run_async self.include_reason = include_reason self.n = n self.strict_mode = strict_mode @@ -73,9 +73,9 @@ def measure(self, test_case: LLMTestCase) -> float: self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + self.run_async, ): - if self.asynchronous: + if self.run_async: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) diff --git a/deepeval/metrics/toxicity/toxicity.py b/deepeval/metrics/toxicity/toxicity.py index 80aab9f94..63ec7abfc 100644 --- a/deepeval/metrics/toxicity/toxicity.py +++ b/deepeval/metrics/toxicity/toxicity.py @@ -32,7 +32,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - asynchronous: bool = True, + run_async: bool = True, strict_mode: bool = False, ): self.threshold = 0 if strict_mode else threshold @@ -42,7 +42,7 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.asynchronous = asynchronous + self.run_async = run_async self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: @@ -51,9 +51,9 @@ def measure(self, test_case: LLMTestCase) -> float: self.__name__, self.evaluation_model, self.strict_mode, - self.asynchronous, + self.run_async, ): - if self.asynchronous: + if self.run_async: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) diff --git a/deepeval/progress_context.py 
b/deepeval/progress_context.py index 52d59a471..f978d871f 100644 --- a/deepeval/progress_context.py +++ b/deepeval/progress_context.py @@ -24,12 +24,12 @@ def metrics_progress_context( metric_name: str, evaluation_model: str, strict_mode: bool, - asynchronous: bool, + run_async: bool, show_indicator: bool = True, total: int = 9999, transient: bool = True, ): - description = f"✨ 🍰 ✨ You're using DeepEval's latest {metric_name} Metric (using {evaluation_model}, strict={strict_mode}, async={asynchronous})! This may take a minute..." + description = f"✨ 🍰 ✨ You're using DeepEval's latest {metric_name} Metric (using {evaluation_model}, strict={strict_mode}, async={run_async})! This may take a minute..." console = Console(file=sys.stderr) # Direct output to standard error if show_indicator: with Progress( diff --git a/tests/test_custom_metric.py b/tests/test_custom_metric.py index 6a897a6ec..1b7b94663 100644 --- a/tests/test_custom_metric.py +++ b/tests/test_custom_metric.py @@ -40,4 +40,4 @@ def test_length_metric(): latency=8.3, ) # a_measure not implemented - assert_test(test_case, [metric], asynchronous=False) + assert_test(test_case, [metric], run_async=False) diff --git a/tests/test_deployment.py b/tests/test_deployment.py index d7de56318..9ea659b46 100644 --- a/tests/test_deployment.py +++ b/tests/test_deployment.py @@ -13,7 +13,7 @@ class FakeMetric(BaseMetric): def __init__(self, threshold: float = 0.5): self.threshold = threshold - def measure(self, test_case: LLMTestCase, _asynchronous): + def measure(self, test_case: LLMTestCase, _run_async): # Set self.success and self.score in the "measure" method self.score = random.uniform(0.0, 1.0) self.success = self.score >= self.threshold diff --git a/tests/test_everything.py b/tests/test_everything.py index 433919979..5d9bf770e 100644 --- a/tests/test_everything.py +++ b/tests/test_everything.py @@ -75,7 +75,7 @@ @pytest.mark.skip(reason="openai is expensive") def test_everything(): metric1 = AnswerRelevancyMetric( - threshold=0.5, strict_mode=strict_mode, asynchronous=True + threshold=0.5, strict_mode=strict_mode, run_async=True ) metric2 = FaithfulnessMetric(threshold=0.5, strict_mode=strict_mode) metric3 = ContextualPrecisionMetric(threshold=0.5, strict_mode=strict_mode) diff --git a/tests/test_faithfulness.py b/tests/test_faithfulness.py index 8c693a334..8c4af8365 100644 --- a/tests/test_faithfulness.py +++ b/tests/test_faithfulness.py @@ -46,5 +46,5 @@ def test_faithfulness(): actual_output=output, retrieval_context=[one, two, three], ) - metric = FaithfulnessMetric(asynchronous=False) + metric = FaithfulnessMetric(run_async=False) assert_test(test_case, [metric]) diff --git a/tests/test_g_eval.py b/tests/test_g_eval.py index 3f95f1b88..04f793649 100644 --- a/tests/test_g_eval.py +++ b/tests/test_g_eval.py @@ -21,4 +21,4 @@ def test_g_eval(): expected_output="Paris", context=["Geography"], ) - assert_test(test_case, [metric], asynchronous=False) + assert_test(test_case, [metric], run_async=False) diff --git a/tests/test_summarization.py b/tests/test_summarization.py index 7a293fbfa..9ca9b3e2f 100644 --- a/tests/test_summarization.py +++ b/tests/test_summarization.py @@ -26,4 +26,4 @@ def test_summarization(): test_case = LLMTestCase(input=input, actual_output=output) - assert_test(test_case, [metric], asynchronous=False) + assert_test(test_case, [metric], run_async=False) From 137d7adba0c0a8a6ecde1fd0425f5c3811050105 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sat, 9 Mar 2024 03:46:44 +0800 Subject: [PATCH 43/59] added new spinner 
--- deepeval/__init__.py | 3 +- deepeval/evaluate.py | 74 +- .../answer_relevancy/answer_relevancy.py | 17 +- deepeval/metrics/bias/bias.py | 19 +- .../contextual_precision.py | 19 +- .../contextual_recall/contextual_recall.py | 19 +- .../contextual_relevancy.py | 19 +- deepeval/metrics/cost.py | 4 +- deepeval/metrics/faithfulness/faithfulness.py | 17 +- deepeval/metrics/g_eval/g_eval.py | 27 +- .../metrics/hallucination/hallucination.py | 17 +- deepeval/metrics/indicator.py | 91 ++ .../knowledge_retention.py | 8 +- deepeval/metrics/latency.py | 4 +- deepeval/metrics/ragas.py | 44 +- .../metrics/summarization/summarization.py | 19 +- deepeval/metrics/toxicity/toxicity.py | 17 +- deepeval/progress_context.py | 25 - deepeval/utils.py | 14 + poetry.lock | 1213 +++++++++-------- tests/test_everything.py | 1 + 21 files changed, 875 insertions(+), 796 deletions(-) create mode 100644 deepeval/metrics/indicator.py diff --git a/deepeval/__init__.py b/deepeval/__init__.py index 5d44fdc9c..3067c2644 100644 --- a/deepeval/__init__.py +++ b/deepeval/__init__.py @@ -5,7 +5,7 @@ from ._version import __version__ from deepeval.event import track -from deepeval.evaluate import evaluate, run_test, assert_test +from deepeval.evaluate import evaluate, assert_test from deepeval.test_run import on_test_run_end, log_hyperparameters from deepeval.utils import login_with_confident_api_key from deepeval.telemetry import * @@ -15,7 +15,6 @@ "log_hyperparameters", "track", "evaluate", - "run_test", "assert_test", "on_test_run_end", ] diff --git a/deepeval/evaluate.py b/deepeval/evaluate.py index ab47133af..346eb95ed 100644 --- a/deepeval/evaluate.py +++ b/deepeval/evaluate.py @@ -6,13 +6,15 @@ from deepeval.utils import drop_and_copy, get_or_create_event_loop from deepeval.telemetry import capture_evaluation_count -from deepeval.progress_context import progress_context from deepeval.metrics import BaseMetric +from deepeval.metrics.indicator import ( + measure_metrics_with_indicator, +) from deepeval.test_case import LLMTestCase from deepeval.tracing import get_trace_stack from deepeval.constants import PYTEST_RUN_TEST_NAME from deepeval.test_run import test_run_manager, APITestCase, MetricsMetadata -from deepeval.utils import get_is_running_deepeval +from deepeval.utils import get_is_running_deepeval, disable_indicator @dataclass @@ -127,11 +129,7 @@ async def a_execute_test_cases( api_test_case: APITestCase = create_api_test_case(test_case, index) test_start_time = time.perf_counter() - # Run metrics concurrently using asyncio.gather - await asyncio.gather( - *[metric.a_measure(test_case) for metric in metrics] - ) - + await measure_metrics_with_indicator(metrics, test_case) for metric in metrics: metric_metadata = MetricsMetadata( metric=metric.__name__, @@ -164,29 +162,6 @@ async def a_execute_test_cases( return test_results -def run_test( - test_case: LLMTestCase, - metrics: List[BaseMetric], -) -> List[TestResult]: - # TODO: refactor - for metric in metrics: - if not isinstance(metric, BaseMetric): - raise TypeError("Provided 'metric' must be of type 'BaseMetric'.") - - # TODO: refactor - if not isinstance(test_case, LLMTestCase): - raise TypeError("'test_case' must be an instance of 'LLMTestCase'.") - - test_run_manager.reset() - with progress_context("Executing run_test()..."): - test_result = execute_test_cases([test_case], metrics, False)[0] - capture_evaluation_count() - print_test_result(test_result) - print("") - print("-" * 70) - return test_result - - def assert_test( test_case: LLMTestCase, metrics: 
List[BaseMetric], run_async: bool = True ): @@ -218,7 +193,7 @@ def assert_test( ] failed_metrics_str = ", ".join( [ - f"{metric.__name__} (score: {metric.score}, threshold: {metric.threshold})" + f"{metric.__name__} (score: {metric.score}, threshold: {metric.threshold}, strict: {metric.strict_mode})" for metric in failed_metrics ] ) @@ -229,7 +204,11 @@ def evaluate( test_cases: List[LLMTestCase], metrics: List[BaseMetric], run_async: bool = True, + show_indicator: bool = True, ): + if show_indicator is False: + disable_indicator() + # TODO: refactor for metric in metrics: if not isinstance(metric, BaseMetric): @@ -243,22 +222,23 @@ def evaluate( ) test_run_manager.reset() - with progress_context("Evaluating testcases..."): - if run_async: - loop = get_or_create_event_loop() - test_results = loop.run_until_complete( - a_execute_test_cases(test_cases, metrics, True) - ) - else: - test_results = execute_test_cases(test_cases, metrics, True) - capture_evaluation_count() - for test_result in test_results: - print_test_result(test_result) - print("") - print("-" * 70) - - test_run_manager.wrap_up_test_run(display_table=False) - return test_results + print("Evaluating test cases...") + if run_async: + loop = get_or_create_event_loop() + test_results = loop.run_until_complete( + a_execute_test_cases(test_cases, metrics, True) + ) + else: + test_results = execute_test_cases(test_cases, metrics, True) + + capture_evaluation_count() + for test_result in test_results: + print_test_result(test_result) + + print("") + print("-" * 70) + test_run_manager.wrap_up_test_run(display_table=False) + return test_results def print_test_result(test_result: TestResult): diff --git a/deepeval/metrics/answer_relevancy/answer_relevancy.py b/deepeval/metrics/answer_relevancy/answer_relevancy.py index a17d19a3f..692c4caba 100644 --- a/deepeval/metrics/answer_relevancy/answer_relevancy.py +++ b/deepeval/metrics/answer_relevancy/answer_relevancy.py @@ -11,7 +11,7 @@ from deepeval.metrics import BaseMetric from deepeval.models import GPTModel, DeepEvalBaseLLM from deepeval.metrics.answer_relevancy.template import AnswerRelevancyTemplate -from deepeval.progress_context import metrics_progress_context +from deepeval.metrics.indicator import metric_progress_indicator from deepeval.telemetry import capture_metric_type required_params: List[LLMTestCaseParams] = [ @@ -46,12 +46,7 @@ def __init__( def measure(self, test_case: LLMTestCase) -> float: check_test_case_params(test_case, required_params, self.__name__) - with metrics_progress_context( - self.__name__, - self.evaluation_model, - self.strict_mode, - self.run_async, - ): + with metric_progress_indicator(self): if self.run_async: loop = get_or_create_event_loop() loop.run_until_complete( @@ -74,12 +69,8 @@ async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: check_test_case_params(test_case, required_params, self.__name__) - with metrics_progress_context( - self.__name__, - self.evaluation_model, - self.strict_mode, - True, - _show_indicator, + with metric_progress_indicator( + self, is_async=True, _show_indicator=_show_indicator ): self.statements: List[str] = await self._a_generate_statements( test_case.actual_output diff --git a/deepeval/metrics/bias/bias.py b/deepeval/metrics/bias/bias.py index f8d77388a..80347905c 100644 --- a/deepeval/metrics/bias/bias.py +++ b/deepeval/metrics/bias/bias.py @@ -3,7 +3,7 @@ from deepeval.metrics import BaseMetric from deepeval.test_case import LLMTestCase, LLMTestCaseParams -from 
deepeval.progress_context import metrics_progress_context +from deepeval.metrics.indicator import metric_progress_indicator from deepeval.telemetry import capture_metric_type from deepeval.models import GPTModel, DeepEvalBaseLLM from deepeval.utils import ( @@ -47,12 +47,7 @@ def __init__( def measure(self, test_case: LLMTestCase) -> float: check_test_case_params(test_case, required_params, self.__name__) - with metrics_progress_context( - self.__name__, - self.evaluation_model, - self.strict_mode, - self.run_async, - ): + with metric_progress_indicator(self): if self.run_async: loop = get_or_create_event_loop() loop.run_until_complete( @@ -73,12 +68,10 @@ async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: check_test_case_params(test_case, required_params, self.__name__) - with metrics_progress_context( - self.__name__, - self.evaluation_model, - self.strict_mode, - True, - _show_indicator, + with metric_progress_indicator( + self, + is_async=True, + _show_indicator=_show_indicator, ): self.opinions: List[str] = await self._a_generate_opinions( test_case.actual_output diff --git a/deepeval/metrics/contextual_precision/contextual_precision.py b/deepeval/metrics/contextual_precision/contextual_precision.py index 172b6bc86..229a044f6 100644 --- a/deepeval/metrics/contextual_precision/contextual_precision.py +++ b/deepeval/metrics/contextual_precision/contextual_precision.py @@ -12,7 +12,7 @@ from deepeval.metrics.contextual_precision.template import ( ContextualPrecisionTemplate, ) -from deepeval.progress_context import metrics_progress_context +from deepeval.metrics.indicator import metric_progress_indicator from deepeval.telemetry import capture_metric_type @@ -50,12 +50,7 @@ def __init__( def measure(self, test_case: LLMTestCase) -> float: check_test_case_params(test_case, required_params, self.__name__) - with metrics_progress_context( - self.__name__, - self.evaluation_model, - self.strict_mode, - self.run_async, - ): + with metric_progress_indicator(self): if self.run_async: loop = get_or_create_event_loop() @@ -80,12 +75,10 @@ async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: check_test_case_params(test_case, required_params, self.__name__) - with metrics_progress_context( - self.__name__, - self.evaluation_model, - self.strict_mode, - True, - _show_indicator, + with metric_progress_indicator( + self, + is_async=True, + _show_indicator=_show_indicator, ): self.verdicts: List[ContextualPrecisionVerdict] = ( await self._a_generate_verdicts( diff --git a/deepeval/metrics/contextual_recall/contextual_recall.py b/deepeval/metrics/contextual_recall/contextual_recall.py index dd674aa12..baf039416 100644 --- a/deepeval/metrics/contextual_recall/contextual_recall.py +++ b/deepeval/metrics/contextual_recall/contextual_recall.py @@ -10,7 +10,7 @@ from deepeval.metrics import BaseMetric from deepeval.models import GPTModel, DeepEvalBaseLLM from deepeval.metrics.contextual_recall.template import ContextualRecallTemplate -from deepeval.progress_context import metrics_progress_context +from deepeval.metrics.indicator import metric_progress_indicator from deepeval.telemetry import capture_metric_type required_params: List[LLMTestCaseParams] = [ @@ -47,12 +47,7 @@ def __init__( def measure(self, test_case: LLMTestCase) -> float: check_test_case_params(test_case, required_params, self.__name__) - with metrics_progress_context( - self.__name__, - self.evaluation_model, - self.strict_mode, - self.run_async, - ): + with 
metric_progress_indicator(self): if self.run_async: loop = get_or_create_event_loop() loop.run_until_complete( @@ -74,12 +69,10 @@ async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: check_test_case_params(test_case, required_params, self.__name__) - with metrics_progress_context( - self.__name__, - self.evaluation_model, - self.strict_mode, - True, - _show_indicator, + with metric_progress_indicator( + self, + is_async=True, + _show_indicator=_show_indicator, ): self.verdicts: List[ContextualRecallVerdict] = ( await self._a_generate_verdicts( diff --git a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py index fe9b19610..3409b42bd 100644 --- a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +++ b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py @@ -13,7 +13,7 @@ from deepeval.metrics.contextual_relevancy.template import ( ContextualRelevancyTemplate, ) -from deepeval.progress_context import metrics_progress_context +from deepeval.metrics.indicator import metric_progress_indicator from deepeval.telemetry import capture_metric_type required_params: List[LLMTestCaseParams] = [ @@ -49,12 +49,7 @@ def __init__( def measure(self, test_case: LLMTestCase) -> float: check_test_case_params(test_case, required_params, self.__name__) - with metrics_progress_context( - self.__name__, - self.evaluation_model, - self.strict_mode, - self.run_async, - ): + with metric_progress_indicator(self): if self.run_async: loop = get_or_create_event_loop() loop.run_until_complete( @@ -76,12 +71,10 @@ async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: check_test_case_params(test_case, required_params, self.__name__) - with metrics_progress_context( - self.__name__, - self.evaluation_model, - self.strict_mode, - True, - _show_indicator, + with metric_progress_indicator( + self, + is_async=True, + _show_indicator=_show_indicator, ): self.verdicts: List[ContextualRelevancyVerdict] = ( await self._a_generate_verdicts( diff --git a/deepeval/metrics/cost.py b/deepeval/metrics/cost.py index 84cb47654..5aa06c717 100644 --- a/deepeval/metrics/cost.py +++ b/deepeval/metrics/cost.py @@ -15,7 +15,9 @@ def measure(self, test_case: LLMTestCase): capture_metric_type(self.__name__) return self.score - async def a_measure(self, test_case: LLMTestCase): + async def a_measure( + self, test_case: LLMTestCase, _show_indicator: bool = True + ): self.success = test_case.cost <= self.threshold self.score = test_case.cost capture_metric_type(self.__name__) diff --git a/deepeval/metrics/faithfulness/faithfulness.py b/deepeval/metrics/faithfulness/faithfulness.py index f1c094bba..ccdcb1fe1 100644 --- a/deepeval/metrics/faithfulness/faithfulness.py +++ b/deepeval/metrics/faithfulness/faithfulness.py @@ -11,7 +11,7 @@ ) from deepeval.models import GPTModel, DeepEvalBaseLLM from deepeval.metrics.faithfulness.template import FaithfulnessTemplate -from deepeval.progress_context import metrics_progress_context +from deepeval.metrics.indicator import metric_progress_indicator from deepeval.telemetry import capture_metric_type required_params: List[LLMTestCaseParams] = [ @@ -47,12 +47,7 @@ def __init__( def measure(self, test_case: LLMTestCase) -> float: check_test_case_params(test_case, required_params, self.__name__) - with metrics_progress_context( - self.__name__, - self.evaluation_model, - self.strict_mode, - self.run_async, - ): + with 
metric_progress_indicator(self): if self.run_async: loop = get_or_create_event_loop() loop.run_until_complete( @@ -72,12 +67,8 @@ async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: check_test_case_params(test_case, required_params, self.__name__) - with metrics_progress_context( - self.__name__, - self.evaluation_model, - self.strict_mode, - True, - _show_indicator, + with metric_progress_indicator( + self, is_async=True, _show_indicator=_show_indicator ): self.truths, self.claims = await asyncio.gather( self._a_generate_truths(test_case.retrieval_context), diff --git a/deepeval/metrics/g_eval/g_eval.py b/deepeval/metrics/g_eval/g_eval.py index 100f341af..7a017b5f4 100644 --- a/deepeval/metrics/g_eval/g_eval.py +++ b/deepeval/metrics/g_eval/g_eval.py @@ -11,7 +11,7 @@ ) from deepeval.models import GPTModel, DeepEvalBaseLLM from deepeval.telemetry import capture_metric_type -from deepeval.progress_context import metrics_progress_context +from deepeval.metrics.indicator import metric_progress_indicator G_EVAL_PARAMS = { LLMTestCaseParams.INPUT: "Input", @@ -53,7 +53,7 @@ def __init__( evaluation_steps: Optional[List[str]] = None, model: Optional[Union[str, DeepEvalBaseLLM]] = None, threshold: float = 0.5, - asynchronous: bool = True, + run_async: bool = True, strict_mode: bool = False, ): self.name = name @@ -84,20 +84,15 @@ def __init__( self.evaluation_steps = evaluation_steps self.threshold = 1 if strict_mode else threshold self.strict_mode = strict_mode - self.asynchronous = asynchronous + self.run_async = run_async def measure(self, test_case: LLMTestCase) -> float: """LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf""" check_test_case_params( test_case, self.evaluation_params, f"GEval({self.__name__})" ) - with metrics_progress_context( - f"GEval ({self.__name__})", - self.evaluation_model, - self.strict_mode, - self.asynchronous, - ): - if self.asynchronous: + with metric_progress_indicator(self): + if self.run_async: loop = get_or_create_event_loop() loop.run_until_complete( self.a_measure(test_case, _show_indicator=False) @@ -124,12 +119,10 @@ async def a_measure( check_test_case_params( test_case, self.evaluation_params, f"GEval({self.__name__})" ) - with metrics_progress_context( - f"GEval({self.__name__})", - self.evaluation_model, - self.strict_mode, - True, - _show_indicator, + with metric_progress_indicator( + self, + is_async=True, + _show_indicator=_show_indicator, ): self.evaluation_steps: List[str] = ( await self._a_generate_evaluation_steps() @@ -208,4 +201,4 @@ def is_successful(self) -> bool: @property def __name__(self): - return self.name + return f"GEval({self.name})" diff --git a/deepeval/metrics/hallucination/hallucination.py b/deepeval/metrics/hallucination/hallucination.py index 931984f1d..0e09c42e9 100644 --- a/deepeval/metrics/hallucination/hallucination.py +++ b/deepeval/metrics/hallucination/hallucination.py @@ -10,7 +10,7 @@ ) from deepeval.metrics.hallucination.template import HallucinationTemplate from deepeval.models import GPTModel, DeepEvalBaseLLM -from deepeval.progress_context import metrics_progress_context +from deepeval.metrics.indicator import metric_progress_indicator from deepeval.telemetry import capture_metric_type required_params: List[LLMTestCaseParams] = [ @@ -46,12 +46,7 @@ def __init__( def measure(self, test_case: LLMTestCase) -> float: check_test_case_params(test_case, required_params, self.__name__) - with metrics_progress_context( - self.__name__, - 
self.evaluation_model, - self.strict_mode, - self.run_async, - ): + with metric_progress_indicator(self): if self.run_async: loop = get_or_create_event_loop() loop.run_until_complete( @@ -73,12 +68,8 @@ async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: check_test_case_params(test_case, required_params, self.__name__) - with metrics_progress_context( - self.__name__, - self.evaluation_model, - self.strict_mode, - True, - _show_indicator, + with metric_progress_indicator( + self, is_async=True, _show_indicator=_show_indicator ): self.verdicts: List[HallucinationVerdict] = ( await self._a_generate_verdicts( diff --git a/deepeval/metrics/indicator.py b/deepeval/metrics/indicator.py new file mode 100644 index 000000000..8e1b84324 --- /dev/null +++ b/deepeval/metrics/indicator.py @@ -0,0 +1,91 @@ +import asyncio +from rich.console import Console +from rich.progress import Progress, SpinnerColumn, TextColumn +from contextlib import contextmanager +import sys +from typing import List, Optional +import time + +from deepeval.metrics import BaseMetric +from deepeval.test_case import LLMTestCase +from deepeval.utils import show_indicator + + +def format_metric_description( + metric: BaseMetric, is_async: Optional[bool] = None +): + if is_async is None: + run_async = metric.run_async + else: + run_async = is_async + + return f"✨ You're running DeepEval's latest [rgb(106,0,255)]{metric.__name__} Metric[/rgb(106,0,255)]! [rgb(55,65,81)](using {metric.evaluation_model}, strict={metric.strict_mode})...[/rgb(55,65,81)]" + + +@contextmanager +def metric_progress_indicator( + metric: BaseMetric, + is_async: Optional[bool] = None, + _show_indicator: bool = True, + total: int = 9999, + transient: bool = True, +): + console = Console(file=sys.stderr) # Direct output to standard error + if _show_indicator and show_indicator(): + with Progress( + SpinnerColumn(style="rgb(106,0,255)"), + TextColumn("[progress.description]{task.description}"), + console=console, # Use the custom console + transient=transient, + ) as progress: + progress.add_task( + description=format_metric_description(metric, is_async), + total=total, + ) + yield + else: + yield + + +async def measure_metric_task( + task_id, progress, metric: BaseMetric, test_case: LLMTestCase +): + while not progress.finished: + start_time = time.perf_counter() + await metric.a_measure(test_case, _show_indicator=False) + end_time = time.perf_counter() + time_taken = format(end_time - start_time, ".2f") + progress.update(task_id, advance=100) + progress.update( + task_id, + description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done! 
({time_taken}s)", + ) + break + + +async def measure_metrics_with_indicator( + metrics: List[BaseMetric], + test_case: LLMTestCase, +): + if show_indicator(): + with Progress( + SpinnerColumn(style="rgb(106,0,255)"), + TextColumn("[progress.description]{task.description}"), + transient=False, + ) as progress: + tasks = [] + for metric in metrics: + task_id = progress.add_task( + description=format_metric_description(metric), total=100 + ) + tasks.append( + measure_metric_task(task_id, progress, metric, test_case) + ) + await asyncio.gather(*tasks) + else: + await asyncio.gather( + *[ + metric.a_measure(test_case, _show_indicator=False) + for metric in metrics + ] + ) diff --git a/deepeval/metrics/knowledge_retention/knowledge_retention.py b/deepeval/metrics/knowledge_retention/knowledge_retention.py index 4ef8753a3..977d0336d 100644 --- a/deepeval/metrics/knowledge_retention/knowledge_retention.py +++ b/deepeval/metrics/knowledge_retention/knowledge_retention.py @@ -8,7 +8,7 @@ from deepeval.metrics.knowledge_retention.template import ( KnowledgeRetentionTemplate, ) -from deepeval.progress_context import metrics_progress_context +from deepeval.metrics.indicator import metric_progress_indicator from deepeval.telemetry import capture_metric_type @@ -43,11 +43,7 @@ def measure(self, test_case: ConversationalTestCase): if len(test_case.messages) == 0: raise ValueError("Messages cannot be empty") - with metrics_progress_context( - self.__name__, - self.evaluation_model, - self.strict_mode, - ): + with metric_progress_indicator(self): self.knowledges: List[Knowledge] = self._generate_knowledges( test_case ) diff --git a/deepeval/metrics/latency.py b/deepeval/metrics/latency.py index 9d50d3935..3c9ce6aa3 100644 --- a/deepeval/metrics/latency.py +++ b/deepeval/metrics/latency.py @@ -15,7 +15,9 @@ def measure(self, test_case: LLMTestCase): capture_metric_type(self.__name__) return self.score - async def a_measure(self, test_case: LLMTestCase): + async def a_measure( + self, test_case: LLMTestCase, _show_indicator: bool = True + ): self.success = test_case.latency <= self.threshold self.score = test_case.latency capture_metric_type(self.__name__) diff --git a/deepeval/metrics/ragas.py b/deepeval/metrics/ragas.py index 9809af1b4..5c6dc3c6b 100644 --- a/deepeval/metrics/ragas.py +++ b/deepeval/metrics/ragas.py @@ -65,7 +65,9 @@ def measure(self, test_case: LLMTestCase): capture_metric_type(self.__name__) return self.score - async def a_measure(self, test_case: LLMTestCase): + async def a_measure( + self, test_case: LLMTestCase, _show_indicator: bool = True + ): return self.measure(test_case) def is_successful(self): @@ -88,7 +90,9 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() - async def a_measure(self, test_case: LLMTestCase): + async def a_measure( + self, test_case: LLMTestCase, _show_indicator: bool = True + ): return self.measure(test_case) def measure(self, test_case: LLMTestCase): @@ -150,7 +154,9 @@ def __init__( self.evaluation_model = self.model.get_model_name() self.embeddings = embeddings - async def a_measure(self, test_case: LLMTestCase): + async def a_measure( + self, test_case: LLMTestCase, _show_indicator: bool = True + ): return self.measure(test_case) def measure(self, test_case: LLMTestCase): @@ -209,7 +215,9 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() - async def a_measure(self, test_case: LLMTestCase): + async def a_measure( + self, test_case: LLMTestCase, 
_show_indicator: bool = True + ): return self.measure(test_case) def measure(self, test_case: LLMTestCase): @@ -264,7 +272,9 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() - async def a_measure(self, test_case: LLMTestCase): + async def a_measure( + self, test_case: LLMTestCase, _show_indicator: bool = True + ): return self.measure(test_case) def measure(self, test_case: LLMTestCase): @@ -320,7 +330,9 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() - async def a_measure(self, test_case: LLMTestCase): + async def a_measure( + self, test_case: LLMTestCase, _show_indicator: bool = True + ): return self.measure(test_case) def measure(self, test_case: LLMTestCase): @@ -377,7 +389,9 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() - async def a_measure(self, test_case: LLMTestCase): + async def a_measure( + self, test_case: LLMTestCase, _show_indicator: bool = True + ): return self.measure(test_case) def measure(self, test_case: LLMTestCase): @@ -432,7 +446,9 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() - async def a_measure(self, test_case: LLMTestCase): + async def a_measure( + self, test_case: LLMTestCase, _show_indicator: bool = True + ): return self.measure(test_case) def measure(self, test_case: LLMTestCase): @@ -488,7 +504,9 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() - async def a_measure(self, test_case: LLMTestCase): + async def a_measure( + self, test_case: LLMTestCase, _show_indicator: bool = True + ): return self.measure(test_case) def measure(self, test_case: LLMTestCase): @@ -544,7 +562,9 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() - async def a_measure(self, test_case: LLMTestCase): + async def a_measure( + self, test_case: LLMTestCase, _show_indicator: bool = True + ): return self.measure(test_case) def measure(self, test_case: LLMTestCase): @@ -602,7 +622,9 @@ def __init__( self.evaluation_model = self.model.get_model_name() self.embeddings = embeddings - async def a_measure(self, test_case: LLMTestCase): + async def a_measure( + self, test_case: LLMTestCase, _show_indicator: bool = True + ): return self.measure(test_case) def measure(self, test_case: LLMTestCase): diff --git a/deepeval/metrics/summarization/summarization.py b/deepeval/metrics/summarization/summarization.py index e11dc70b2..6fb0cbfe0 100644 --- a/deepeval/metrics/summarization/summarization.py +++ b/deepeval/metrics/summarization/summarization.py @@ -13,7 +13,7 @@ ) from deepeval.metrics.summarization.template import SummarizationTemplate from deepeval.metrics.faithfulness.template import FaithfulnessTemplate -from deepeval.progress_context import metrics_progress_context +from deepeval.metrics.indicator import metric_progress_indicator from deepeval.telemetry import capture_metric_type required_params: List[LLMTestCaseParams] = [ @@ -69,12 +69,7 @@ def __init__( def measure(self, test_case: LLMTestCase) -> float: check_test_case_params(test_case, required_params, self.__name__) - with metrics_progress_context( - self.__name__, - self.evaluation_model, - self.strict_mode, - self.run_async, - ): + with metric_progress_indicator(self): if self.run_async: loop = get_or_create_event_loop() loop.run_until_complete( @@ -107,12 +102,10 @@ async def a_measure( self, test_case: 
LLMTestCase, _show_indicator: bool = True ) -> float: check_test_case_params(test_case, required_params, self.__name__) - with metrics_progress_context( - self.__name__, - self.evaluation_model, - self.strict_mode, - True, - _show_indicator, + with metric_progress_indicator( + self, + is_async=True, + _show_indicator=_show_indicator, ): self.truths, self.claims = await asyncio.gather( self._a_generate_claims(test_case.input), diff --git a/deepeval/metrics/toxicity/toxicity.py b/deepeval/metrics/toxicity/toxicity.py index 63ec7abfc..f6d250691 100644 --- a/deepeval/metrics/toxicity/toxicity.py +++ b/deepeval/metrics/toxicity/toxicity.py @@ -3,7 +3,7 @@ from deepeval.metrics import BaseMetric from deepeval.test_case import LLMTestCase, LLMTestCaseParams -from deepeval.progress_context import metrics_progress_context +from deepeval.metrics.indicator import metric_progress_indicator from deepeval.telemetry import capture_metric_type from deepeval.models import GPTModel, DeepEvalBaseLLM from deepeval.utils import ( @@ -47,12 +47,7 @@ def __init__( def measure(self, test_case: LLMTestCase) -> float: check_test_case_params(test_case, required_params, self.__name__) - with metrics_progress_context( - self.__name__, - self.evaluation_model, - self.strict_mode, - self.run_async, - ): + with metric_progress_indicator(self): if self.run_async: loop = get_or_create_event_loop() loop.run_until_complete( @@ -74,12 +69,8 @@ async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: check_test_case_params(test_case, required_params, self.__name__) - with metrics_progress_context( - self.__name__, - self.evaluation_model, - self.strict_mode, - True, - _show_indicator, + with metric_progress_indicator( + self, is_async=True, _show_indicator=_show_indicator ): self.opinions: List[str] = await self._a_generate_opinions( test_case.actual_output diff --git a/deepeval/progress_context.py b/deepeval/progress_context.py index f978d871f..997dfbdc5 100644 --- a/deepeval/progress_context.py +++ b/deepeval/progress_context.py @@ -19,31 +19,6 @@ def progress_context( yield -@contextmanager -def metrics_progress_context( - metric_name: str, - evaluation_model: str, - strict_mode: bool, - run_async: bool, - show_indicator: bool = True, - total: int = 9999, - transient: bool = True, -): - description = f"✨ 🍰 ✨ You're using DeepEval's latest {metric_name} Metric (using {evaluation_model}, strict={strict_mode}, async={run_async})! This may take a minute..." 
- console = Console(file=sys.stderr) # Direct output to standard error - if show_indicator: - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - console=console, # Use the custom console - transient=transient, - ) as progress: - progress.add_task(description=description, total=total) - yield - else: - yield - - @contextmanager def synthesizer_progress_context( evaluation_model: str, diff --git a/deepeval/utils.py b/deepeval/utils.py index 4a6562bc7..3765d9f07 100644 --- a/deepeval/utils.py +++ b/deepeval/utils.py @@ -17,6 +17,20 @@ from deepeval.test_case import LLMTestCase, LLMTestCaseParams +def show_indicator(): + try: + if os.environ["DISABLE_DEEPEVAL_INDICATOR"] == "YES": + return False + else: + return True + except: + return True + + +def disable_indicator(): + os.environ["DISABLE_DEEPEVAL_INDICATOR"] = "YES" + + def check_test_case_params( test_case: LLMTestCase, test_case_params: List[LLMTestCaseParams], diff --git a/poetry.lock b/poetry.lock index b1db9b1f1..fce84d4ca 100644 --- a/poetry.lock +++ b/poetry.lock @@ -335,17 +335,18 @@ beautifulsoup4 = "*" [[package]] name = "build" -version = "1.0.3" +version = "1.1.1" description = "A simple, correct Python build frontend" optional = false python-versions = ">= 3.7" files = [ - {file = "build-1.0.3-py3-none-any.whl", hash = "sha256:589bf99a67df7c9cf07ec0ac0e5e2ea5d4b37ac63301c4986d1acb126aa83f8f"}, - {file = "build-1.0.3.tar.gz", hash = "sha256:538aab1b64f9828977f84bc63ae570b060a8ed1be419e7870b8b4fc5e6ea553b"}, + {file = "build-1.1.1-py3-none-any.whl", hash = "sha256:8ed0851ee76e6e38adce47e4bee3b51c771d86c64cf578d0c2245567ee200e73"}, + {file = "build-1.1.1.tar.gz", hash = "sha256:8eea65bb45b1aac2e734ba2cc8dad3a6d97d97901a395bd0ed3e7b46953d2a31"}, ] [package.dependencies] colorama = {version = "*", markers = "os_name == \"nt\""} +importlib-metadata = {version = ">=4.6", markers = "python_full_version < \"3.10.2\""} packaging = ">=19.0" pyproject_hooks = "*" tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} @@ -358,13 +359,13 @@ virtualenv = ["virtualenv (>=20.0.35)"] [[package]] name = "cachetools" -version = "5.3.2" +version = "5.3.3" description = "Extensible memoizing collections and decorators" optional = false python-versions = ">=3.7" files = [ - {file = "cachetools-5.3.2-py3-none-any.whl", hash = "sha256:861f35a13a451f94e301ce2bec7cac63e881232ccce7ed67fab9b5df4d3beaa1"}, - {file = "cachetools-5.3.2.tar.gz", hash = "sha256:086ee420196f7b2ab9ca2db2520aca326318b68fe5ba8bc4d49cca91add450f2"}, + {file = "cachetools-5.3.3-py3-none-any.whl", hash = "sha256:0abad1021d3f8325b2fc1d2e9c8b9c9d57b04c3932657a72465447332c24d945"}, + {file = "cachetools-5.3.3.tar.gz", hash = "sha256:ba29e2dfa0b8b556606f097407ed1aa62080ee108ab0dc5ec9d6a723a007d105"}, ] [[package]] @@ -516,13 +517,13 @@ numpy = "*" [[package]] name = "chromadb" -version = "0.4.22" +version = "0.4.24" description = "Chroma." 
optional = false python-versions = ">=3.8" files = [ - {file = "chromadb-0.4.22-py3-none-any.whl", hash = "sha256:ad210b27b4cda2f09d15adc9c83c81bfa66b69f39648a27b637306e40de0680d"}, - {file = "chromadb-0.4.22.tar.gz", hash = "sha256:c793149e1c2bbbb52d77602c6c0594c5752f04cd9be12619250ddad2082af27a"}, + {file = "chromadb-0.4.24-py3-none-any.whl", hash = "sha256:3a08e237a4ad28b5d176685bd22429a03717fe09d35022fb230d516108da01da"}, + {file = "chromadb-0.4.24.tar.gz", hash = "sha256:a5c80b4e4ad9b236ed2d4899a5b9e8002b489293f2881cb2cadab5b199ee1c72"}, ] [package.dependencies] @@ -540,6 +541,7 @@ opentelemetry-api = ">=1.2.0" opentelemetry-exporter-otlp-proto-grpc = ">=1.2.0" opentelemetry-instrumentation-fastapi = ">=0.41b0" opentelemetry-sdk = ">=1.2.0" +orjson = ">=3.9.12" overrides = ">=7.3.1" posthog = ">=2.4.0" pulsar-client = ">=3.1.0" @@ -598,63 +600,63 @@ cron = ["capturer (>=2.4)"] [[package]] name = "coverage" -version = "7.4.2" +version = "7.4.3" description = "Code coverage measurement for Python" optional = false python-versions = ">=3.8" files = [ - {file = "coverage-7.4.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bf54c3e089179d9d23900e3efc86d46e4431188d9a657f345410eecdd0151f50"}, - {file = "coverage-7.4.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fe6e43c8b510719b48af7db9631b5fbac910ade4bd90e6378c85ac5ac706382c"}, - {file = "coverage-7.4.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b98c89db1b150d851a7840142d60d01d07677a18f0f46836e691c38134ed18b"}, - {file = "coverage-7.4.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c5f9683be6a5b19cd776ee4e2f2ffb411424819c69afab6b2db3a0a364ec6642"}, - {file = "coverage-7.4.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78cdcbf7b9cb83fe047ee09298e25b1cd1636824067166dc97ad0543b079d22f"}, - {file = "coverage-7.4.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:2599972b21911111114100d362aea9e70a88b258400672626efa2b9e2179609c"}, - {file = "coverage-7.4.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ef00d31b7569ed3cb2036f26565f1984b9fc08541731ce01012b02a4c238bf03"}, - {file = "coverage-7.4.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:20a875bfd8c282985c4720c32aa05056f77a68e6d8bbc5fe8632c5860ee0b49b"}, - {file = "coverage-7.4.2-cp310-cp310-win32.whl", hash = "sha256:b3f2b1eb229f23c82898eedfc3296137cf1f16bb145ceab3edfd17cbde273fb7"}, - {file = "coverage-7.4.2-cp310-cp310-win_amd64.whl", hash = "sha256:7df95fdd1432a5d2675ce630fef5f239939e2b3610fe2f2b5bf21fa505256fa3"}, - {file = "coverage-7.4.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a8ddbd158e069dded57738ea69b9744525181e99974c899b39f75b2b29a624e2"}, - {file = "coverage-7.4.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81a5fb41b0d24447a47543b749adc34d45a2cf77b48ca74e5bf3de60a7bd9edc"}, - {file = "coverage-7.4.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2412e98e70f16243be41d20836abd5f3f32edef07cbf8f407f1b6e1ceae783ac"}, - {file = "coverage-7.4.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ddb79414c15c6f03f56cc68fa06994f047cf20207c31b5dad3f6bab54a0f66ef"}, - {file = "coverage-7.4.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cf89ab85027427d351f1de918aff4b43f4eb5f33aff6835ed30322a86ac29c9e"}, - {file = 
"coverage-7.4.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a178b7b1ac0f1530bb28d2e51f88c0bab3e5949835851a60dda80bff6052510c"}, - {file = "coverage-7.4.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:06fe398145a2e91edaf1ab4eee66149c6776c6b25b136f4a86fcbbb09512fd10"}, - {file = "coverage-7.4.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:18cac867950943fe93d6cd56a67eb7dcd2d4a781a40f4c1e25d6f1ed98721a55"}, - {file = "coverage-7.4.2-cp311-cp311-win32.whl", hash = "sha256:f72cdd2586f9a769570d4b5714a3837b3a59a53b096bb954f1811f6a0afad305"}, - {file = "coverage-7.4.2-cp311-cp311-win_amd64.whl", hash = "sha256:d779a48fac416387dd5673fc5b2d6bd903ed903faaa3247dc1865c65eaa5a93e"}, - {file = "coverage-7.4.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:adbdfcda2469d188d79771d5696dc54fab98a16d2ef7e0875013b5f56a251047"}, - {file = "coverage-7.4.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ac4bab32f396b03ebecfcf2971668da9275b3bb5f81b3b6ba96622f4ef3f6e17"}, - {file = "coverage-7.4.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:006d220ba2e1a45f1de083d5022d4955abb0aedd78904cd5a779b955b019ec73"}, - {file = "coverage-7.4.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3733545eb294e5ad274abe131d1e7e7de4ba17a144505c12feca48803fea5f64"}, - {file = "coverage-7.4.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42a9e754aa250fe61f0f99986399cec086d7e7a01dd82fd863a20af34cbce962"}, - {file = "coverage-7.4.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:2ed37e16cf35c8d6e0b430254574b8edd242a367a1b1531bd1adc99c6a5e00fe"}, - {file = "coverage-7.4.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:b953275d4edfab6cc0ed7139fa773dfb89e81fee1569a932f6020ce7c6da0e8f"}, - {file = "coverage-7.4.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:32b4ab7e6c924f945cbae5392832e93e4ceb81483fd6dc4aa8fb1a97b9d3e0e1"}, - {file = "coverage-7.4.2-cp312-cp312-win32.whl", hash = "sha256:f5df76c58977bc35a49515b2fbba84a1d952ff0ec784a4070334dfbec28a2def"}, - {file = "coverage-7.4.2-cp312-cp312-win_amd64.whl", hash = "sha256:34423abbaad70fea9d0164add189eabaea679068ebdf693baa5c02d03e7db244"}, - {file = "coverage-7.4.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5b11f9c6587668e495cc7365f85c93bed34c3a81f9f08b0920b87a89acc13469"}, - {file = "coverage-7.4.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:51593a1f05c39332f623d64d910445fdec3d2ac2d96b37ce7f331882d5678ddf"}, - {file = "coverage-7.4.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69f1665165ba2fe7614e2f0c1aed71e14d83510bf67e2ee13df467d1c08bf1e8"}, - {file = "coverage-7.4.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3c8bbb95a699c80a167478478efe5e09ad31680931ec280bf2087905e3b95ec"}, - {file = "coverage-7.4.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:175f56572f25e1e1201d2b3e07b71ca4d201bf0b9cb8fad3f1dfae6a4188de86"}, - {file = "coverage-7.4.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8562ca91e8c40864942615b1d0b12289d3e745e6b2da901d133f52f2d510a1e3"}, - {file = "coverage-7.4.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:d9a1ef0f173e1a19738f154fb3644f90d0ada56fe6c9b422f992b04266c55d5a"}, - {file = "coverage-7.4.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = 
"sha256:f40ac873045db4fd98a6f40387d242bde2708a3f8167bd967ccd43ad46394ba2"}, - {file = "coverage-7.4.2-cp38-cp38-win32.whl", hash = "sha256:d1b750a8409bec61caa7824bfd64a8074b6d2d420433f64c161a8335796c7c6b"}, - {file = "coverage-7.4.2-cp38-cp38-win_amd64.whl", hash = "sha256:b4ae777bebaed89e3a7e80c4a03fac434a98a8abb5251b2a957d38fe3fd30088"}, - {file = "coverage-7.4.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3ff7f92ae5a456101ca8f48387fd3c56eb96353588e686286f50633a611afc95"}, - {file = "coverage-7.4.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:861d75402269ffda0b33af94694b8e0703563116b04c681b1832903fac8fd647"}, - {file = "coverage-7.4.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3507427d83fa961cbd73f11140f4a5ce84208d31756f7238d6257b2d3d868405"}, - {file = "coverage-7.4.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bf711d517e21fb5bc429f5c4308fbc430a8585ff2a43e88540264ae87871e36a"}, - {file = "coverage-7.4.2-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c00e54f0bd258ab25e7f731ca1d5144b0bf7bec0051abccd2bdcff65fa3262c9"}, - {file = "coverage-7.4.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f8e845d894e39fb53834da826078f6dc1a933b32b1478cf437007367efaf6f6a"}, - {file = "coverage-7.4.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:840456cb1067dc350af9080298c7c2cfdddcedc1cb1e0b30dceecdaf7be1a2d3"}, - {file = "coverage-7.4.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c11ca2df2206a4e3e4c4567f52594637392ed05d7c7fb73b4ea1c658ba560265"}, - {file = "coverage-7.4.2-cp39-cp39-win32.whl", hash = "sha256:3ff5bdb08d8938d336ce4088ca1a1e4b6c8cd3bef8bb3a4c0eb2f37406e49643"}, - {file = "coverage-7.4.2-cp39-cp39-win_amd64.whl", hash = "sha256:ac9e95cefcf044c98d4e2c829cd0669918585755dd9a92e28a1a7012322d0a95"}, - {file = "coverage-7.4.2-pp38.pp39.pp310-none-any.whl", hash = "sha256:f593a4a90118d99014517c2679e04a4ef5aee2d81aa05c26c734d271065efcb6"}, - {file = "coverage-7.4.2.tar.gz", hash = "sha256:1a5ee18e3a8d766075ce9314ed1cb695414bae67df6a4b0805f5137d93d6f1cb"}, + {file = "coverage-7.4.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8580b827d4746d47294c0e0b92854c85a92c2227927433998f0d3320ae8a71b6"}, + {file = "coverage-7.4.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:718187eeb9849fc6cc23e0d9b092bc2348821c5e1a901c9f8975df0bc785bfd4"}, + {file = "coverage-7.4.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:767b35c3a246bcb55b8044fd3a43b8cd553dd1f9f2c1eeb87a302b1f8daa0524"}, + {file = "coverage-7.4.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae7f19afe0cce50039e2c782bff379c7e347cba335429678450b8fe81c4ef96d"}, + {file = "coverage-7.4.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba3a8aaed13770e970b3df46980cb068d1c24af1a1968b7818b69af8c4347efb"}, + {file = "coverage-7.4.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:ee866acc0861caebb4f2ab79f0b94dbfbdbfadc19f82e6e9c93930f74e11d7a0"}, + {file = "coverage-7.4.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:506edb1dd49e13a2d4cac6a5173317b82a23c9d6e8df63efb4f0380de0fbccbc"}, + {file = "coverage-7.4.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fd6545d97c98a192c5ac995d21c894b581f1fd14cf389be90724d21808b657e2"}, + {file = "coverage-7.4.3-cp310-cp310-win32.whl", hash = 
"sha256:f6a09b360d67e589236a44f0c39218a8efba2593b6abdccc300a8862cffc2f94"}, + {file = "coverage-7.4.3-cp310-cp310-win_amd64.whl", hash = "sha256:18d90523ce7553dd0b7e23cbb28865db23cddfd683a38fb224115f7826de78d0"}, + {file = "coverage-7.4.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cbbe5e739d45a52f3200a771c6d2c7acf89eb2524890a4a3aa1a7fa0695d2a47"}, + {file = "coverage-7.4.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:489763b2d037b164846ebac0cbd368b8a4ca56385c4090807ff9fad817de4113"}, + {file = "coverage-7.4.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:451f433ad901b3bb00184d83fd83d135fb682d780b38af7944c9faeecb1e0bfe"}, + {file = "coverage-7.4.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fcc66e222cf4c719fe7722a403888b1f5e1682d1679bd780e2b26c18bb648cdc"}, + {file = "coverage-7.4.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3ec74cfef2d985e145baae90d9b1b32f85e1741b04cd967aaf9cfa84c1334f3"}, + {file = "coverage-7.4.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:abbbd8093c5229c72d4c2926afaee0e6e3140de69d5dcd918b2921f2f0c8baba"}, + {file = "coverage-7.4.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:35eb581efdacf7b7422af677b92170da4ef34500467381e805944a3201df2079"}, + {file = "coverage-7.4.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8249b1c7334be8f8c3abcaaa996e1e4927b0e5a23b65f5bf6cfe3180d8ca7840"}, + {file = "coverage-7.4.3-cp311-cp311-win32.whl", hash = "sha256:cf30900aa1ba595312ae41978b95e256e419d8a823af79ce670835409fc02ad3"}, + {file = "coverage-7.4.3-cp311-cp311-win_amd64.whl", hash = "sha256:18c7320695c949de11a351742ee001849912fd57e62a706d83dfc1581897fa2e"}, + {file = "coverage-7.4.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b51bfc348925e92a9bd9b2e48dad13431b57011fd1038f08316e6bf1df107d10"}, + {file = "coverage-7.4.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d6cdecaedea1ea9e033d8adf6a0ab11107b49571bbb9737175444cea6eb72328"}, + {file = "coverage-7.4.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3b2eccb883368f9e972e216c7b4c7c06cabda925b5f06dde0650281cb7666a30"}, + {file = "coverage-7.4.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6c00cdc8fa4e50e1cc1f941a7f2e3e0f26cb2a1233c9696f26963ff58445bac7"}, + {file = "coverage-7.4.3-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9a4a8dd3dcf4cbd3165737358e4d7dfbd9d59902ad11e3b15eebb6393b0446e"}, + {file = "coverage-7.4.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:062b0a75d9261e2f9c6d071753f7eef0fc9caf3a2c82d36d76667ba7b6470003"}, + {file = "coverage-7.4.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:ebe7c9e67a2d15fa97b77ea6571ce5e1e1f6b0db71d1d5e96f8d2bf134303c1d"}, + {file = "coverage-7.4.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:c0a120238dd71c68484f02562f6d446d736adcc6ca0993712289b102705a9a3a"}, + {file = "coverage-7.4.3-cp312-cp312-win32.whl", hash = "sha256:37389611ba54fd6d278fde86eb2c013c8e50232e38f5c68235d09d0a3f8aa352"}, + {file = "coverage-7.4.3-cp312-cp312-win_amd64.whl", hash = "sha256:d25b937a5d9ffa857d41be042b4238dd61db888533b53bc76dc082cb5a15e914"}, + {file = "coverage-7.4.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:28ca2098939eabab044ad68850aac8f8db6bf0b29bc7f2887d05889b17346454"}, + {file = 
"coverage-7.4.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:280459f0a03cecbe8800786cdc23067a8fc64c0bd51dc614008d9c36e1659d7e"}, + {file = "coverage-7.4.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c0cdedd3500e0511eac1517bf560149764b7d8e65cb800d8bf1c63ebf39edd2"}, + {file = "coverage-7.4.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9a9babb9466fe1da12417a4aed923e90124a534736de6201794a3aea9d98484e"}, + {file = "coverage-7.4.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dec9de46a33cf2dd87a5254af095a409ea3bf952d85ad339751e7de6d962cde6"}, + {file = "coverage-7.4.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:16bae383a9cc5abab9bb05c10a3e5a52e0a788325dc9ba8499e821885928968c"}, + {file = "coverage-7.4.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:2c854ce44e1ee31bda4e318af1dbcfc929026d12c5ed030095ad98197eeeaed0"}, + {file = "coverage-7.4.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ce8c50520f57ec57aa21a63ea4f325c7b657386b3f02ccaedeccf9ebe27686e1"}, + {file = "coverage-7.4.3-cp38-cp38-win32.whl", hash = "sha256:708a3369dcf055c00ddeeaa2b20f0dd1ce664eeabde6623e516c5228b753654f"}, + {file = "coverage-7.4.3-cp38-cp38-win_amd64.whl", hash = "sha256:1bf25fbca0c8d121a3e92a2a0555c7e5bc981aee5c3fdaf4bb7809f410f696b9"}, + {file = "coverage-7.4.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3b253094dbe1b431d3a4ac2f053b6d7ede2664ac559705a704f621742e034f1f"}, + {file = "coverage-7.4.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:77fbfc5720cceac9c200054b9fab50cb2a7d79660609200ab83f5db96162d20c"}, + {file = "coverage-7.4.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6679060424faa9c11808598504c3ab472de4531c571ab2befa32f4971835788e"}, + {file = "coverage-7.4.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4af154d617c875b52651dd8dd17a31270c495082f3d55f6128e7629658d63765"}, + {file = "coverage-7.4.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8640f1fde5e1b8e3439fe482cdc2b0bb6c329f4bb161927c28d2e8879c6029ee"}, + {file = "coverage-7.4.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:69b9f6f66c0af29642e73a520b6fed25ff9fd69a25975ebe6acb297234eda501"}, + {file = "coverage-7.4.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:0842571634f39016a6c03e9d4aba502be652a6e4455fadb73cd3a3a49173e38f"}, + {file = "coverage-7.4.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a78ed23b08e8ab524551f52953a8a05d61c3a760781762aac49f8de6eede8c45"}, + {file = "coverage-7.4.3-cp39-cp39-win32.whl", hash = "sha256:c0524de3ff096e15fcbfe8f056fdb4ea0bf497d584454f344d59fce069d3e6e9"}, + {file = "coverage-7.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:0209a6369ccce576b43bb227dc8322d8ef9e323d089c6f3f26a597b09cb4d2aa"}, + {file = "coverage-7.4.3-pp38.pp39.pp310-none-any.whl", hash = "sha256:7cbde573904625509a3f37b6fecea974e363460b556a627c60dc2f47e2fffa51"}, + {file = "coverage-7.4.3.tar.gz", hash = "sha256:276f6077a5c61447a48d133ed13e759c09e62aff0dc84274a68dc18660104d52"}, ] [package.extras] @@ -677,20 +679,20 @@ typing-inspect = ">=0.4.0,<1" [[package]] name = "datasets" -version = "2.17.1" +version = "2.18.0" description = "HuggingFace community-driven open-source library of datasets" optional = false python-versions = ">=3.8.0" files = [ - {file = "datasets-2.17.1-py3-none-any.whl", 
hash = "sha256:346974daf2fe9c14ddb35646896b2308b95e7dc27709d1a6e25273573b140cf8"}, - {file = "datasets-2.17.1.tar.gz", hash = "sha256:66ec24077807f374f379b62ab0256c4dcb7c38a57ff1529a22993e8d95f2f9f1"}, + {file = "datasets-2.18.0-py3-none-any.whl", hash = "sha256:f1bbf0e2896917a914de01cbd37075b14deea3837af87ad0d9f697388ccaeb50"}, + {file = "datasets-2.18.0.tar.gz", hash = "sha256:cdf8b8c6abf7316377ba4f49f9589a4c74556d6b481afd0abd2284f3d69185cb"}, ] [package.dependencies] aiohttp = "*" dill = ">=0.3.0,<0.3.9" filelock = "*" -fsspec = {version = ">=2023.1.0,<=2023.10.0", extras = ["http"]} +fsspec = {version = ">=2023.1.0,<=2024.2.0", extras = ["http"]} huggingface-hub = ">=0.19.4" multiprocess = "*" numpy = ">=1.17" @@ -707,11 +709,11 @@ xxhash = "*" apache-beam = ["apache-beam (>=2.26.0)"] audio = ["librosa", "soundfile (>=0.12.1)"] benchmarks = ["tensorflow (==2.12.0)", "torch (==2.0.1)", "transformers (==4.30.1)"] -dev = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.6.4)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.1.5)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "sqlalchemy", "tensorflow (>=2.2.0,!=2.6.0,!=2.6.1)", "tensorflow (>=2.3,!=2.6.0,!=2.6.1)", "tensorflow-macos", "tiktoken", "torch", "torch (>=2.0.0)", "transformers", "typing-extensions (>=4.6.1)", "zstandard"] +dev = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0)", "elasticsearch (<8.0.0)", "faiss-cpu (>=1.6.4)", "jax (>=0.3.14)", "jaxlib (>=0.3.14)", "joblib (<1.3.0)", "joblibspark", "librosa", "lz4", "py7zr", "pyspark (>=3.4)", "pytest", "pytest-datadir", "pytest-xdist", "rarfile (>=4.0)", "ruff (>=0.3.0)", "s3fs", "s3fs (>=2021.11.1)", "soundfile (>=0.12.1)", "sqlalchemy", "tensorflow (>=2.2.0,!=2.6.0,!=2.6.1)", "tensorflow (>=2.3,!=2.6.0,!=2.6.1)", "tensorflow-macos", "tiktoken", "torch", "torch (>=2.0.0)", "transformers", "typing-extensions (>=4.6.1)", "zstandard"] docs = ["s3fs", "tensorflow (>=2.2.0,!=2.6.0,!=2.6.1)", "tensorflow-macos", "torch", "transformers"] jax = ["jax (>=0.3.14)", "jaxlib (>=0.3.14)"] metrics-tests = ["Werkzeug (>=1.0.1)", "accelerate", "bert-score (>=0.3.6)", "jiwer", "langdetect", "mauve-text", "nltk", "requests-file (>=1.5.1)", "rouge-score", "sacrebleu", "sacremoses", "scikit-learn", "scipy", "sentencepiece", "seqeval", "six (>=1.15.0,<1.16.0)", "spacy (>=3.0.0)", "texttable (>=1.6.3)", "tldextract", "tldextract (>=3.1.0)", "toml (>=0.10.1)", "typer (<0.5.0)"] -quality = ["ruff (>=0.1.5)"] +quality = ["ruff (>=0.3.0)"] s3 = ["s3fs"] tensorflow = ["tensorflow (>=2.2.0,!=2.6.0,!=2.6.1)", "tensorflow-macos"] tensorflow-gpu = ["tensorflow-gpu (>=2.2.0,!=2.6.0,!=2.6.1)"] @@ -803,13 +805,13 @@ testing = ["hatch", "pre-commit", "pytest", "tox"] [[package]] name = "fastapi" -version = "0.109.2" +version = "0.110.0" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" optional = false python-versions = ">=3.8" files = [ - {file = "fastapi-0.109.2-py3-none-any.whl", hash = "sha256:2c9bab24667293b501cad8dd388c05240c850b58ec5876ee3283c47d6e1e3a4d"}, - {file = "fastapi-0.109.2.tar.gz", hash = "sha256:f3817eac96fe4f65a2ebb4baa000f394e55f5fccdaf7f75250804bc58f354f73"}, + {file = "fastapi-0.110.0-py3-none-any.whl", hash = "sha256:87a1f6fb632a218222c5984be540055346a8f5d8a68e8f6fb647b1dc9934de4b"}, + {file = "fastapi-0.110.0.tar.gz", hash = 
"sha256:266775f0dcc95af9d3ef39bad55cff525329a931d5fd51930aadd4f428bf7ff3"}, ] [package.dependencies] @@ -838,13 +840,13 @@ typing = ["typing-extensions (>=4.8)"] [[package]] name = "flatbuffers" -version = "23.5.26" +version = "24.3.7" description = "The FlatBuffers serialization format for Python" optional = false python-versions = "*" files = [ - {file = "flatbuffers-23.5.26-py2.py3-none-any.whl", hash = "sha256:c0ff356da363087b915fde4b8b45bdda73432fc17cddb3c8157472eab1422ad1"}, - {file = "flatbuffers-23.5.26.tar.gz", hash = "sha256:9ea1144cac05ce5d86e2859f431c6cd5e66cd9c78c558317c7955fb8d4c78d89"}, + {file = "flatbuffers-24.3.7-py2.py3-none-any.whl", hash = "sha256:80c4f5dcad0ee76b7e349671a0d657f2fbba927a0244f88dd3f5ed6a3694e1fc"}, + {file = "flatbuffers-24.3.7.tar.gz", hash = "sha256:0895c22b9a6019ff2f4de2e5e2f7cd15914043e6e7033a94c0c6369422690f22"}, ] [[package]] @@ -935,18 +937,17 @@ files = [ [[package]] name = "fsspec" -version = "2023.10.0" +version = "2024.2.0" description = "File-system specification" optional = false python-versions = ">=3.8" files = [ - {file = "fsspec-2023.10.0-py3-none-any.whl", hash = "sha256:346a8f024efeb749d2a5fca7ba8854474b1ff9af7c3faaf636a4548781136529"}, - {file = "fsspec-2023.10.0.tar.gz", hash = "sha256:330c66757591df346ad3091a53bd907e15348c2ba17d63fd54f5c39c4457d2a5"}, + {file = "fsspec-2024.2.0-py3-none-any.whl", hash = "sha256:817f969556fa5916bc682e02ca2045f96ff7f586d45110fcb76022063ad2c7d8"}, + {file = "fsspec-2024.2.0.tar.gz", hash = "sha256:b6ad1a679f760dda52b1168c859d01b7b80648ea6f7f7c7f5a8a91dc3f3ecb84"}, ] [package.dependencies] aiohttp = {version = "<4.0.0a0 || >4.0.0a0,<4.0.0a1 || >4.0.0a1", optional = true, markers = "extra == \"http\""} -requests = {version = "*", optional = true, markers = "extra == \"http\""} [package.extras] abfs = ["adlfs"] @@ -963,7 +964,7 @@ github = ["requests"] gs = ["gcsfs"] gui = ["panel"] hdfs = ["pyarrow (>=1)"] -http = ["aiohttp (!=4.0.0a0,!=4.0.0a1)", "requests"] +http = ["aiohttp (!=4.0.0a0,!=4.0.0a1)"] libarchive = ["libarchive-c"] oci = ["ocifs"] s3 = ["s3fs"] @@ -974,13 +975,13 @@ tqdm = ["tqdm"] [[package]] name = "google-auth" -version = "2.28.0" +version = "2.28.2" description = "Google Authentication Library" optional = false python-versions = ">=3.7" files = [ - {file = "google-auth-2.28.0.tar.gz", hash = "sha256:3cfc1b6e4e64797584fb53fc9bd0b7afa9b7c0dba2004fa7dcc9349e58cc3195"}, - {file = "google_auth-2.28.0-py2.py3-none-any.whl", hash = "sha256:7634d29dcd1e101f5226a23cbc4a0c6cda6394253bf80e281d9c5c6797869c53"}, + {file = "google-auth-2.28.2.tar.gz", hash = "sha256:80b8b4969aa9ed5938c7828308f20f035bc79f9d8fb8120bf9dc8db20b41ba30"}, + {file = "google_auth-2.28.2-py2.py3-none-any.whl", hash = "sha256:9fd67bbcd40f16d9d42f950228e9cf02a2ded4ae49198b27432d0cded5a74c38"}, ] [package.dependencies] @@ -1085,69 +1086,69 @@ test = ["objgraph", "psutil"] [[package]] name = "grpcio" -version = "1.60.1" +version = "1.62.1" description = "HTTP/2-based RPC framework" optional = false python-versions = ">=3.7" files = [ - {file = "grpcio-1.60.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:14e8f2c84c0832773fb3958240c69def72357bc11392571f87b2d7b91e0bb092"}, - {file = "grpcio-1.60.1-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:33aed0a431f5befeffd9d346b0fa44b2c01aa4aeae5ea5b2c03d3e25e0071216"}, - {file = "grpcio-1.60.1-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:fead980fbc68512dfd4e0c7b1f5754c2a8e5015a04dea454b9cada54a8423525"}, - {file = 
"grpcio-1.60.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:082081e6a36b6eb5cf0fd9a897fe777dbb3802176ffd08e3ec6567edd85bc104"}, - {file = "grpcio-1.60.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55ccb7db5a665079d68b5c7c86359ebd5ebf31a19bc1a91c982fd622f1e31ff2"}, - {file = "grpcio-1.60.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:9b54577032d4f235452f77a83169b6527bf4b77d73aeada97d45b2aaf1bf5ce0"}, - {file = "grpcio-1.60.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7d142bcd604166417929b071cd396aa13c565749a4c840d6c702727a59d835eb"}, - {file = "grpcio-1.60.1-cp310-cp310-win32.whl", hash = "sha256:2a6087f234cb570008a6041c8ffd1b7d657b397fdd6d26e83d72283dae3527b1"}, - {file = "grpcio-1.60.1-cp310-cp310-win_amd64.whl", hash = "sha256:f2212796593ad1d0235068c79836861f2201fc7137a99aa2fea7beeb3b101177"}, - {file = "grpcio-1.60.1-cp311-cp311-linux_armv7l.whl", hash = "sha256:79ae0dc785504cb1e1788758c588c711f4e4a0195d70dff53db203c95a0bd303"}, - {file = "grpcio-1.60.1-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:4eec8b8c1c2c9b7125508ff7c89d5701bf933c99d3910e446ed531cd16ad5d87"}, - {file = "grpcio-1.60.1-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:8c9554ca8e26241dabe7951aa1fa03a1ba0856688ecd7e7bdbdd286ebc272e4c"}, - {file = "grpcio-1.60.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91422ba785a8e7a18725b1dc40fbd88f08a5bb4c7f1b3e8739cab24b04fa8a03"}, - {file = "grpcio-1.60.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cba6209c96828711cb7c8fcb45ecef8c8859238baf15119daa1bef0f6c84bfe7"}, - {file = "grpcio-1.60.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c71be3f86d67d8d1311c6076a4ba3b75ba5703c0b856b4e691c9097f9b1e8bd2"}, - {file = "grpcio-1.60.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:af5ef6cfaf0d023c00002ba25d0751e5995fa0e4c9eec6cd263c30352662cbce"}, - {file = "grpcio-1.60.1-cp311-cp311-win32.whl", hash = "sha256:a09506eb48fa5493c58f946c46754ef22f3ec0df64f2b5149373ff31fb67f3dd"}, - {file = "grpcio-1.60.1-cp311-cp311-win_amd64.whl", hash = "sha256:49c9b6a510e3ed8df5f6f4f3c34d7fbf2d2cae048ee90a45cd7415abab72912c"}, - {file = "grpcio-1.60.1-cp312-cp312-linux_armv7l.whl", hash = "sha256:b58b855d0071575ea9c7bc0d84a06d2edfbfccec52e9657864386381a7ce1ae9"}, - {file = "grpcio-1.60.1-cp312-cp312-macosx_10_10_universal2.whl", hash = "sha256:a731ac5cffc34dac62053e0da90f0c0b8560396a19f69d9703e88240c8f05858"}, - {file = "grpcio-1.60.1-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:cf77f8cf2a651fbd869fbdcb4a1931464189cd210abc4cfad357f1cacc8642a6"}, - {file = "grpcio-1.60.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c557e94e91a983e5b1e9c60076a8fd79fea1e7e06848eb2e48d0ccfb30f6e073"}, - {file = "grpcio-1.60.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:069fe2aeee02dfd2135d562d0663fe70fbb69d5eed6eb3389042a7e963b54de8"}, - {file = "grpcio-1.60.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb0af13433dbbd1c806e671d81ec75bd324af6ef75171fd7815ca3074fe32bfe"}, - {file = "grpcio-1.60.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2f44c32aef186bbba254129cea1df08a20be414144ac3bdf0e84b24e3f3b2e05"}, - {file = "grpcio-1.60.1-cp312-cp312-win32.whl", hash = "sha256:a212e5dea1a4182e40cd3e4067ee46be9d10418092ce3627475e995cca95de21"}, - {file = "grpcio-1.60.1-cp312-cp312-win_amd64.whl", hash = "sha256:6e490fa5f7f5326222cb9f0b78f207a2b218a14edf39602e083d5f617354306f"}, - 
{file = "grpcio-1.60.1-cp37-cp37m-linux_armv7l.whl", hash = "sha256:4216e67ad9a4769117433814956031cb300f85edc855252a645a9a724b3b6594"}, - {file = "grpcio-1.60.1-cp37-cp37m-macosx_10_10_universal2.whl", hash = "sha256:73e14acd3d4247169955fae8fb103a2b900cfad21d0c35f0dcd0fdd54cd60367"}, - {file = "grpcio-1.60.1-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:6ecf21d20d02d1733e9c820fb5c114c749d888704a7ec824b545c12e78734d1c"}, - {file = "grpcio-1.60.1-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:33bdea30dcfd4f87b045d404388469eb48a48c33a6195a043d116ed1b9a0196c"}, - {file = "grpcio-1.60.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53b69e79d00f78c81eecfb38f4516080dc7f36a198b6b37b928f1c13b3c063e9"}, - {file = "grpcio-1.60.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:39aa848794b887120b1d35b1b994e445cc028ff602ef267f87c38122c1add50d"}, - {file = "grpcio-1.60.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:72153a0d2e425f45b884540a61c6639436ddafa1829a42056aa5764b84108b8e"}, - {file = "grpcio-1.60.1-cp37-cp37m-win_amd64.whl", hash = "sha256:50d56280b482875d1f9128ce596e59031a226a8b84bec88cb2bf76c289f5d0de"}, - {file = "grpcio-1.60.1-cp38-cp38-linux_armv7l.whl", hash = "sha256:6d140bdeb26cad8b93c1455fa00573c05592793c32053d6e0016ce05ba267549"}, - {file = "grpcio-1.60.1-cp38-cp38-macosx_10_10_universal2.whl", hash = "sha256:bc808924470643b82b14fe121923c30ec211d8c693e747eba8a7414bc4351a23"}, - {file = "grpcio-1.60.1-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:70c83bb530572917be20c21f3b6be92cd86b9aecb44b0c18b1d3b2cc3ae47df0"}, - {file = "grpcio-1.60.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b106bc52e7f28170e624ba61cc7dc6829566e535a6ec68528f8e1afbed1c41f"}, - {file = "grpcio-1.60.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30e980cd6db1088c144b92fe376747328d5554bc7960ce583ec7b7d81cd47287"}, - {file = "grpcio-1.60.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0c5807e9152eff15f1d48f6b9ad3749196f79a4a050469d99eecb679be592acc"}, - {file = "grpcio-1.60.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f1c3dc536b3ee124e8b24feb7533e5c70b9f2ef833e3b2e5513b2897fd46763a"}, - {file = "grpcio-1.60.1-cp38-cp38-win32.whl", hash = "sha256:d7404cebcdb11bb5bd40bf94131faf7e9a7c10a6c60358580fe83913f360f929"}, - {file = "grpcio-1.60.1-cp38-cp38-win_amd64.whl", hash = "sha256:c8754c75f55781515a3005063d9a05878b2cfb3cb7e41d5401ad0cf19de14872"}, - {file = "grpcio-1.60.1-cp39-cp39-linux_armv7l.whl", hash = "sha256:0250a7a70b14000fa311de04b169cc7480be6c1a769b190769d347939d3232a8"}, - {file = "grpcio-1.60.1-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:660fc6b9c2a9ea3bb2a7e64ba878c98339abaf1811edca904ac85e9e662f1d73"}, - {file = "grpcio-1.60.1-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:76eaaba891083fcbe167aa0f03363311a9f12da975b025d30e94b93ac7a765fc"}, - {file = "grpcio-1.60.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5d97c65ea7e097056f3d1ead77040ebc236feaf7f71489383d20f3b4c28412a"}, - {file = "grpcio-1.60.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bb2a2911b028f01c8c64d126f6b632fcd8a9ac975aa1b3855766c94e4107180"}, - {file = "grpcio-1.60.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:5a1ebbae7e2214f51b1f23b57bf98eeed2cf1ba84e4d523c48c36d5b2f8829ff"}, - {file = "grpcio-1.60.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9a66f4d2a005bc78e61d805ed95dedfcb35efa84b7bba0403c6d60d13a3de2d6"}, 
- {file = "grpcio-1.60.1-cp39-cp39-win32.whl", hash = "sha256:8d488fbdbf04283f0d20742b64968d44825617aa6717b07c006168ed16488804"}, - {file = "grpcio-1.60.1-cp39-cp39-win_amd64.whl", hash = "sha256:61b7199cd2a55e62e45bfb629a35b71fc2c0cb88f686a047f25b1112d3810904"}, - {file = "grpcio-1.60.1.tar.gz", hash = "sha256:dd1d3a8d1d2e50ad9b59e10aa7f07c7d1be2b367f3f2d33c5fade96ed5460962"}, + {file = "grpcio-1.62.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:179bee6f5ed7b5f618844f760b6acf7e910988de77a4f75b95bbfaa8106f3c1e"}, + {file = "grpcio-1.62.1-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:48611e4fa010e823ba2de8fd3f77c1322dd60cb0d180dc6630a7e157b205f7ea"}, + {file = "grpcio-1.62.1-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:b2a0e71b0a2158aa4bce48be9f8f9eb45cbd17c78c7443616d00abbe2a509f6d"}, + {file = "grpcio-1.62.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fbe80577c7880911d3ad65e5ecc997416c98f354efeba2f8d0f9112a67ed65a5"}, + {file = "grpcio-1.62.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58f6c693d446964e3292425e1d16e21a97a48ba9172f2d0df9d7b640acb99243"}, + {file = "grpcio-1.62.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:77c339403db5a20ef4fed02e4d1a9a3d9866bf9c0afc77a42234677313ea22f3"}, + {file = "grpcio-1.62.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b5a4ea906db7dec694098435d84bf2854fe158eb3cd51e1107e571246d4d1d70"}, + {file = "grpcio-1.62.1-cp310-cp310-win32.whl", hash = "sha256:4187201a53f8561c015bc745b81a1b2d278967b8de35f3399b84b0695e281d5f"}, + {file = "grpcio-1.62.1-cp310-cp310-win_amd64.whl", hash = "sha256:844d1f3fb11bd1ed362d3fdc495d0770cfab75761836193af166fee113421d66"}, + {file = "grpcio-1.62.1-cp311-cp311-linux_armv7l.whl", hash = "sha256:833379943d1728a005e44103f17ecd73d058d37d95783eb8f0b28ddc1f54d7b2"}, + {file = "grpcio-1.62.1-cp311-cp311-macosx_10_10_universal2.whl", hash = "sha256:c7fcc6a32e7b7b58f5a7d27530669337a5d587d4066060bcb9dee7a8c833dfb7"}, + {file = "grpcio-1.62.1-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:fa7d28eb4d50b7cbe75bb8b45ed0da9a1dc5b219a0af59449676a29c2eed9698"}, + {file = "grpcio-1.62.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:48f7135c3de2f298b833be8b4ae20cafe37091634e91f61f5a7eb3d61ec6f660"}, + {file = "grpcio-1.62.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:71f11fd63365ade276c9d4a7b7df5c136f9030e3457107e1791b3737a9b9ed6a"}, + {file = "grpcio-1.62.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4b49fd8fe9f9ac23b78437da94c54aa7e9996fbb220bac024a67469ce5d0825f"}, + {file = "grpcio-1.62.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:482ae2ae78679ba9ed5752099b32e5fe580443b4f798e1b71df412abf43375db"}, + {file = "grpcio-1.62.1-cp311-cp311-win32.whl", hash = "sha256:1faa02530b6c7426404372515fe5ddf66e199c2ee613f88f025c6f3bd816450c"}, + {file = "grpcio-1.62.1-cp311-cp311-win_amd64.whl", hash = "sha256:5bd90b8c395f39bc82a5fb32a0173e220e3f401ff697840f4003e15b96d1befc"}, + {file = "grpcio-1.62.1-cp312-cp312-linux_armv7l.whl", hash = "sha256:b134d5d71b4e0837fff574c00e49176051a1c532d26c052a1e43231f252d813b"}, + {file = "grpcio-1.62.1-cp312-cp312-macosx_10_10_universal2.whl", hash = "sha256:d1f6c96573dc09d50dbcbd91dbf71d5cf97640c9427c32584010fbbd4c0e0037"}, + {file = "grpcio-1.62.1-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:359f821d4578f80f41909b9ee9b76fb249a21035a061a327f91c953493782c31"}, + {file = 
"grpcio-1.62.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a485f0c2010c696be269184bdb5ae72781344cb4e60db976c59d84dd6354fac9"}, + {file = "grpcio-1.62.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b50b09b4dc01767163d67e1532f948264167cd27f49e9377e3556c3cba1268e1"}, + {file = "grpcio-1.62.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:3227c667dccbe38f2c4d943238b887bac588d97c104815aecc62d2fd976e014b"}, + {file = "grpcio-1.62.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3952b581eb121324853ce2b191dae08badb75cd493cb4e0243368aa9e61cfd41"}, + {file = "grpcio-1.62.1-cp312-cp312-win32.whl", hash = "sha256:83a17b303425104d6329c10eb34bba186ffa67161e63fa6cdae7776ff76df73f"}, + {file = "grpcio-1.62.1-cp312-cp312-win_amd64.whl", hash = "sha256:6696ffe440333a19d8d128e88d440f91fb92c75a80ce4b44d55800e656a3ef1d"}, + {file = "grpcio-1.62.1-cp37-cp37m-linux_armv7l.whl", hash = "sha256:e3393b0823f938253370ebef033c9fd23d27f3eae8eb9a8f6264900c7ea3fb5a"}, + {file = "grpcio-1.62.1-cp37-cp37m-macosx_10_10_universal2.whl", hash = "sha256:83e7ccb85a74beaeae2634f10eb858a0ed1a63081172649ff4261f929bacfd22"}, + {file = "grpcio-1.62.1-cp37-cp37m-manylinux_2_17_aarch64.whl", hash = "sha256:882020c87999d54667a284c7ddf065b359bd00251fcd70279ac486776dbf84ec"}, + {file = "grpcio-1.62.1-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a10383035e864f386fe096fed5c47d27a2bf7173c56a6e26cffaaa5a361addb1"}, + {file = "grpcio-1.62.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:960edebedc6b9ada1ef58e1c71156f28689978188cd8cff3b646b57288a927d9"}, + {file = "grpcio-1.62.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:23e2e04b83f347d0aadde0c9b616f4726c3d76db04b438fd3904b289a725267f"}, + {file = "grpcio-1.62.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:978121758711916d34fe57c1f75b79cdfc73952f1481bb9583399331682d36f7"}, + {file = "grpcio-1.62.1-cp37-cp37m-win_amd64.whl", hash = "sha256:9084086190cc6d628f282e5615f987288b95457292e969b9205e45b442276407"}, + {file = "grpcio-1.62.1-cp38-cp38-linux_armv7l.whl", hash = "sha256:22bccdd7b23c420a27fd28540fb5dcbc97dc6be105f7698cb0e7d7a420d0e362"}, + {file = "grpcio-1.62.1-cp38-cp38-macosx_10_10_universal2.whl", hash = "sha256:8999bf1b57172dbc7c3e4bb3c732658e918f5c333b2942243f10d0d653953ba9"}, + {file = "grpcio-1.62.1-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:d9e52558b8b8c2f4ac05ac86344a7417ccdd2b460a59616de49eb6933b07a0bd"}, + {file = "grpcio-1.62.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1714e7bc935780bc3de1b3fcbc7674209adf5208ff825799d579ffd6cd0bd505"}, + {file = "grpcio-1.62.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8842ccbd8c0e253c1f189088228f9b433f7a93b7196b9e5b6f87dba393f5d5d"}, + {file = "grpcio-1.62.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1f1e7b36bdff50103af95a80923bf1853f6823dd62f2d2a2524b66ed74103e49"}, + {file = "grpcio-1.62.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:bba97b8e8883a8038606480d6b6772289f4c907f6ba780fa1f7b7da7dfd76f06"}, + {file = "grpcio-1.62.1-cp38-cp38-win32.whl", hash = "sha256:a7f615270fe534548112a74e790cd9d4f5509d744dd718cd442bf016626c22e4"}, + {file = "grpcio-1.62.1-cp38-cp38-win_amd64.whl", hash = "sha256:e6c8c8693df718c5ecbc7babb12c69a4e3677fd11de8886f05ab22d4e6b1c43b"}, + {file = "grpcio-1.62.1-cp39-cp39-linux_armv7l.whl", hash = "sha256:73db2dc1b201d20ab7083e7041946910bb991e7e9761a0394bbc3c2632326483"}, + {file = 
"grpcio-1.62.1-cp39-cp39-macosx_10_10_universal2.whl", hash = "sha256:407b26b7f7bbd4f4751dbc9767a1f0716f9fe72d3d7e96bb3ccfc4aace07c8de"}, + {file = "grpcio-1.62.1-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:f8de7c8cef9261a2d0a62edf2ccea3d741a523c6b8a6477a340a1f2e417658de"}, + {file = "grpcio-1.62.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9bd5c8a1af40ec305d001c60236308a67e25419003e9bb3ebfab5695a8d0b369"}, + {file = "grpcio-1.62.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be0477cb31da67846a33b1a75c611f88bfbcd427fe17701b6317aefceee1b96f"}, + {file = "grpcio-1.62.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:60dcd824df166ba266ee0cfaf35a31406cd16ef602b49f5d4dfb21f014b0dedd"}, + {file = "grpcio-1.62.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:973c49086cabab773525f6077f95e5a993bfc03ba8fc32e32f2c279497780585"}, + {file = "grpcio-1.62.1-cp39-cp39-win32.whl", hash = "sha256:12859468e8918d3bd243d213cd6fd6ab07208195dc140763c00dfe901ce1e1b4"}, + {file = "grpcio-1.62.1-cp39-cp39-win_amd64.whl", hash = "sha256:b7209117bbeebdfa5d898205cc55153a51285757902dd73c47de498ad4d11332"}, + {file = "grpcio-1.62.1.tar.gz", hash = "sha256:6c455e008fa86d9e9a9d85bb76da4277c0d7d9668a3bfa70dbe86e9f3c759947"}, ] [package.extras] -protobuf = ["grpcio-tools (>=1.60.1)"] +protobuf = ["grpcio-tools (>=1.62.1)"] [[package]] name = "h11" @@ -1255,13 +1256,13 @@ socks = ["socksio (==1.*)"] [[package]] name = "huggingface-hub" -version = "0.20.3" +version = "0.21.4" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.8.0" files = [ - {file = "huggingface_hub-0.20.3-py3-none-any.whl", hash = "sha256:d988ae4f00d3e307b0c80c6a05ca6dbb7edba8bba3079f74cda7d9c2e562a7b6"}, - {file = "huggingface_hub-0.20.3.tar.gz", hash = "sha256:94e7f8e074475fbc67d6a71957b678e1b4a74ff1b64a644fd6cbb83da962d05d"}, + {file = "huggingface_hub-0.21.4-py3-none-any.whl", hash = "sha256:df37c2c37fc6c82163cdd8a67ede261687d80d1e262526d6c0ce73b6b3630a7b"}, + {file = "huggingface_hub-0.21.4.tar.gz", hash = "sha256:e1f4968c93726565a80edf6dc309763c7b546d0cfe79aa221206034d50155531"}, ] [package.dependencies] @@ -1278,11 +1279,12 @@ all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "gradio", "jedi", cli = ["InquirerPy (==0.3.4)"] dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "gradio", "jedi", "mypy (==1.5.1)", "numpy", "pydantic (>1.1,<2.0)", "pydantic (>1.1,<3.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.1.3)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"] +hf-transfer = ["hf-transfer (>=0.1.4)"] inference = ["aiohttp", "pydantic (>1.1,<2.0)", "pydantic (>1.1,<3.0)"] quality = ["mypy (==1.5.1)", "ruff (>=0.1.3)"] tensorflow = ["graphviz", "pydot", "tensorflow"] testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "gradio", "jedi", "numpy", "pydantic (>1.1,<2.0)", "pydantic (>1.1,<3.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] -torch = ["torch"] +torch = ["safetensors", "torch"] typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", 
"typing-extensions (>=4.8.0)"] [[package]] @@ -1331,18 +1333,18 @@ testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs [[package]] name = "importlib-resources" -version = "6.1.1" +version = "6.1.3" description = "Read resources from Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "importlib_resources-6.1.1-py3-none-any.whl", hash = "sha256:e8bf90d8213b486f428c9c39714b920041cb02c184686a3dee24905aaa8105d6"}, - {file = "importlib_resources-6.1.1.tar.gz", hash = "sha256:3893a00122eafde6894c59914446a512f728a0c1a45f9bb9b63721b6bacf0b4a"}, + {file = "importlib_resources-6.1.3-py3-none-any.whl", hash = "sha256:4c0269e3580fe2634d364b39b38b961540a7738c02cb984e98add8b4221d793d"}, + {file = "importlib_resources-6.1.3.tar.gz", hash = "sha256:56fb4525197b78544a3354ea27793952ab93f935bb4bf746b846bb1015020f2b"}, ] [package.extras] docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] -testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-ruff", "zipp (>=3.17)"] +testing = ["jaraco.collections", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff (>=0.2.1)", "zipp (>=3.17)"] [[package]] name = "iniconfig" @@ -1419,13 +1421,13 @@ adal = ["adal (>=1.0.2)"] [[package]] name = "langchain" -version = "0.1.8" +version = "0.1.11" description = "Building applications with LLMs through composability" optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "langchain-0.1.8-py3-none-any.whl", hash = "sha256:19e951b0e2be099ff048ee483acecb47e1a39c33a47dadfee70fcfa20f45cc19"}, - {file = "langchain-0.1.8.tar.gz", hash = "sha256:c8b1c2954a07cd6422c9027459473bafae90c78f07015bf2fc6262fadf97ea44"}, + {file = "langchain-0.1.11-py3-none-any.whl", hash = "sha256:b5e678ac50d85370b9bc28f2c97ad5f029aac1c0cca79cac9354adf72741bc6e"}, + {file = "langchain-0.1.11.tar.gz", hash = "sha256:03f08cae7cd3f341c54f1042b3fe24d88f39eba7b7eda942735d8ced13fe6da9"}, ] [package.dependencies] @@ -1433,9 +1435,10 @@ aiohttp = ">=3.8.3,<4.0.0" async-timeout = {version = ">=4.0.0,<5.0.0", markers = "python_version < \"3.11\""} dataclasses-json = ">=0.5.7,<0.7" jsonpatch = ">=1.33,<2.0" -langchain-community = ">=0.0.21,<0.1" -langchain-core = ">=0.1.24,<0.2" -langsmith = ">=0.1.0,<0.2.0" +langchain-community = ">=0.0.25,<0.1" +langchain-core = ">=0.1.29,<0.2" +langchain-text-splitters = ">=0.0.1,<0.1" +langsmith = ">=0.1.17,<0.2.0" numpy = ">=1,<2" pydantic = ">=1,<3" PyYAML = ">=5.3" @@ -1444,7 +1447,7 @@ SQLAlchemy = ">=1.4,<3" tenacity = ">=8.1.0,<9.0.0" [package.extras] -azure = ["azure-ai-formrecognizer (>=3.2.1,<4.0.0)", "azure-ai-textanalytics (>=5.3.0,<6.0.0)", "azure-ai-vision (>=0.11.1b1,<0.12.0)", "azure-cognitiveservices-speech (>=1.28.0,<2.0.0)", "azure-core (>=1.26.4,<2.0.0)", "azure-cosmos (>=4.4.0b1,<5.0.0)", "azure-identity (>=1.12.0,<2.0.0)", "azure-search-documents (==11.4.0b8)", "openai (<2)"] +azure = ["azure-ai-formrecognizer (>=3.2.1,<4.0.0)", "azure-ai-textanalytics (>=5.3.0,<6.0.0)", "azure-cognitiveservices-speech (>=1.28.0,<2.0.0)", "azure-core (>=1.26.4,<2.0.0)", "azure-cosmos (>=4.4.0b1,<5.0.0)", "azure-identity (>=1.12.0,<2.0.0)", "azure-search-documents (==11.4.0b8)", "openai (<2)"] clarifai = ["clarifai (>=9.1.0)"] cli = ["typer (>=0.9.0,<0.10.0)"] cohere = ["cohere (>=4,<5)"] @@ -1459,19 +1462,19 @@ 
text-helpers = ["chardet (>=5.1.0,<6.0.0)"] [[package]] name = "langchain-community" -version = "0.0.21" +version = "0.0.27" description = "Community contributed LangChain integrations." optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "langchain_community-0.0.21-py3-none-any.whl", hash = "sha256:120977485d244eb472ad3618a31222fe6c2bce08026f4caa96bd6dae2e316ac0"}, - {file = "langchain_community-0.0.21.tar.gz", hash = "sha256:1c310a7e2663d5f6464a433981504894f97c12783cbeb8bdf4159a574f88c18d"}, + {file = "langchain_community-0.0.27-py3-none-any.whl", hash = "sha256:377a7429580a71d909012df5aae538d295fa6f21bc479e5dac6fd1589762b3ab"}, + {file = "langchain_community-0.0.27.tar.gz", hash = "sha256:266dffbd4c1666db1889cad953fa5102d4debff782335353b6d78636a761778d"}, ] [package.dependencies] aiohttp = ">=3.8.3,<4.0.0" dataclasses-json = ">=0.5.7,<0.7" -langchain-core = ">=0.1.24,<0.2" +langchain-core = ">=0.1.30,<0.2.0" langsmith = ">=0.1.0,<0.2.0" numpy = ">=1,<2" PyYAML = ">=5.3" @@ -1481,17 +1484,17 @@ tenacity = ">=8.1.0,<9.0.0" [package.extras] cli = ["typer (>=0.9.0,<0.10.0)"] -extended-testing = ["aiosqlite (>=0.19.0,<0.20.0)", "aleph-alpha-client (>=2.15.0,<3.0.0)", "anthropic (>=0.3.11,<0.4.0)", "arxiv (>=1.4,<2.0)", "assemblyai (>=0.17.0,<0.18.0)", "atlassian-python-api (>=3.36.0,<4.0.0)", "azure-ai-documentintelligence (>=1.0.0b1,<2.0.0)", "beautifulsoup4 (>=4,<5)", "bibtexparser (>=1.4.0,<2.0.0)", "cassio (>=0.1.0,<0.2.0)", "chardet (>=5.1.0,<6.0.0)", "cohere (>=4,<5)", "databricks-vectorsearch (>=0.21,<0.22)", "datasets (>=2.15.0,<3.0.0)", "dgml-utils (>=0.3.0,<0.4.0)", "elasticsearch (>=8.12.0,<9.0.0)", "esprima (>=4.0.1,<5.0.0)", "faiss-cpu (>=1,<2)", "feedparser (>=6.0.10,<7.0.0)", "fireworks-ai (>=0.9.0,<0.10.0)", "geopandas (>=0.13.1,<0.14.0)", "gitpython (>=3.1.32,<4.0.0)", "google-cloud-documentai (>=2.20.1,<3.0.0)", "gql (>=3.4.1,<4.0.0)", "gradientai (>=1.4.0,<2.0.0)", "hdbcli (>=2.19.21,<3.0.0)", "hologres-vector (>=0.0.6,<0.0.7)", "html2text (>=2020.1.16,<2021.0.0)", "httpx (>=0.24.1,<0.25.0)", "javelin-sdk (>=0.1.8,<0.2.0)", "jinja2 (>=3,<4)", "jq (>=1.4.1,<2.0.0)", "jsonschema (>1)", "lxml (>=4.9.2,<5.0.0)", "markdownify (>=0.11.6,<0.12.0)", "motor (>=3.3.1,<4.0.0)", "msal (>=1.25.0,<2.0.0)", "mwparserfromhell (>=0.6.4,<0.7.0)", "mwxml (>=0.3.3,<0.4.0)", "newspaper3k (>=0.2.8,<0.3.0)", "numexpr (>=2.8.6,<3.0.0)", "nvidia-riva-client (>=2.14.0,<3.0.0)", "oci (>=2.119.1,<3.0.0)", "openai (<2)", "openapi-pydantic (>=0.3.2,<0.4.0)", "oracle-ads (>=2.9.1,<3.0.0)", "pandas (>=2.0.1,<3.0.0)", "pdfminer-six (>=20221105,<20221106)", "pgvector (>=0.1.6,<0.2.0)", "praw (>=7.7.1,<8.0.0)", "psychicapi (>=0.8.0,<0.9.0)", "py-trello (>=0.19.0,<0.20.0)", "pymupdf (>=1.22.3,<2.0.0)", "pypdf (>=3.4.0,<4.0.0)", "pypdfium2 (>=4.10.0,<5.0.0)", "pyspark (>=3.4.0,<4.0.0)", "rank-bm25 (>=0.2.2,<0.3.0)", "rapidfuzz (>=3.1.1,<4.0.0)", "rapidocr-onnxruntime (>=1.3.2,<2.0.0)", "rdflib (==7.0.0)", "requests-toolbelt (>=1.0.0,<2.0.0)", "rspace_client (>=2.5.0,<3.0.0)", "scikit-learn (>=1.2.2,<2.0.0)", "sqlite-vss (>=0.1.2,<0.2.0)", "streamlit (>=1.18.0,<2.0.0)", "sympy (>=1.12,<2.0)", "telethon (>=1.28.5,<2.0.0)", "timescale-vector (>=0.0.1,<0.0.2)", "tqdm (>=4.48.0)", "tree-sitter (>=0.20.2,<0.21.0)", "tree-sitter-languages (>=1.8.0,<2.0.0)", "upstash-redis (>=0.15.0,<0.16.0)", "xata (>=1.0.0a7,<2.0.0)", "xmltodict (>=0.13.0,<0.14.0)", "zhipuai (>=1.0.7,<2.0.0)"] +extended-testing = ["aiosqlite (>=0.19.0,<0.20.0)", "aleph-alpha-client (>=2.15.0,<3.0.0)", "anthropic 
(>=0.3.11,<0.4.0)", "arxiv (>=1.4,<2.0)", "assemblyai (>=0.17.0,<0.18.0)", "atlassian-python-api (>=3.36.0,<4.0.0)", "azure-ai-documentintelligence (>=1.0.0b1,<2.0.0)", "beautifulsoup4 (>=4,<5)", "bibtexparser (>=1.4.0,<2.0.0)", "cassio (>=0.1.0,<0.2.0)", "chardet (>=5.1.0,<6.0.0)", "cloudpickle (>=2.0.0)", "cohere (>=4,<5)", "databricks-vectorsearch (>=0.21,<0.22)", "datasets (>=2.15.0,<3.0.0)", "dgml-utils (>=0.3.0,<0.4.0)", "elasticsearch (>=8.12.0,<9.0.0)", "esprima (>=4.0.1,<5.0.0)", "faiss-cpu (>=1,<2)", "feedparser (>=6.0.10,<7.0.0)", "fireworks-ai (>=0.9.0,<0.10.0)", "geopandas (>=0.13.1,<0.14.0)", "gitpython (>=3.1.32,<4.0.0)", "google-cloud-documentai (>=2.20.1,<3.0.0)", "gql (>=3.4.1,<4.0.0)", "gradientai (>=1.4.0,<2.0.0)", "hdbcli (>=2.19.21,<3.0.0)", "hologres-vector (>=0.0.6,<0.0.7)", "html2text (>=2020.1.16,<2021.0.0)", "httpx (>=0.24.1,<0.25.0)", "javelin-sdk (>=0.1.8,<0.2.0)", "jinja2 (>=3,<4)", "jq (>=1.4.1,<2.0.0)", "jsonschema (>1)", "lxml (>=4.9.2,<5.0.0)", "markdownify (>=0.11.6,<0.12.0)", "motor (>=3.3.1,<4.0.0)", "msal (>=1.25.0,<2.0.0)", "mwparserfromhell (>=0.6.4,<0.7.0)", "mwxml (>=0.3.3,<0.4.0)", "newspaper3k (>=0.2.8,<0.3.0)", "numexpr (>=2.8.6,<3.0.0)", "nvidia-riva-client (>=2.14.0,<3.0.0)", "oci (>=2.119.1,<3.0.0)", "openai (<2)", "openapi-pydantic (>=0.3.2,<0.4.0)", "oracle-ads (>=2.9.1,<3.0.0)", "pandas (>=2.0.1,<3.0.0)", "pdfminer-six (>=20221105,<20221106)", "pgvector (>=0.1.6,<0.2.0)", "praw (>=7.7.1,<8.0.0)", "psychicapi (>=0.8.0,<0.9.0)", "py-trello (>=0.19.0,<0.20.0)", "pymupdf (>=1.22.3,<2.0.0)", "pypdf (>=3.4.0,<4.0.0)", "pypdfium2 (>=4.10.0,<5.0.0)", "pyspark (>=3.4.0,<4.0.0)", "rank-bm25 (>=0.2.2,<0.3.0)", "rapidfuzz (>=3.1.1,<4.0.0)", "rapidocr-onnxruntime (>=1.3.2,<2.0.0)", "rdflib (==7.0.0)", "requests-toolbelt (>=1.0.0,<2.0.0)", "rspace_client (>=2.5.0,<3.0.0)", "scikit-learn (>=1.2.2,<2.0.0)", "sqlite-vss (>=0.1.2,<0.2.0)", "streamlit (>=1.18.0,<2.0.0)", "sympy (>=1.12,<2.0)", "telethon (>=1.28.5,<2.0.0)", "timescale-vector (>=0.0.1,<0.0.2)", "tqdm (>=4.48.0)", "tree-sitter (>=0.20.2,<0.21.0)", "tree-sitter-languages (>=1.8.0,<2.0.0)", "upstash-redis (>=0.15.0,<0.16.0)", "xata (>=1.0.0a7,<2.0.0)", "xmltodict (>=0.13.0,<0.14.0)", "zhipuai (>=1.0.7,<2.0.0)"] [[package]] name = "langchain-core" -version = "0.1.25" +version = "0.1.30" description = "Building applications with LLMs through composability" optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "langchain_core-0.1.25-py3-none-any.whl", hash = "sha256:ff0a0ad1ed877878e7b9c7601870cd12145abf3c814aae41995968d05ea6c09d"}, - {file = "langchain_core-0.1.25.tar.gz", hash = "sha256:065ff8b4e383c5645d175b20ae44b258330ed06457b0fc0179efee310b6f2af6"}, + {file = "langchain_core-0.1.30-py3-none-any.whl", hash = "sha256:c9643505e41d25ba8f20a2e8bf083d0f0d50b9a098d901511fff8df79f831ada"}, + {file = "langchain_core-0.1.30.tar.gz", hash = "sha256:e13a016e55e7f082ff3eeeda2d0cb505b89a8830e3a23c1d134d0a89d7871894"}, ] [package.dependencies] @@ -1509,51 +1512,68 @@ extended-testing = ["jinja2 (>=3,<4)"] [[package]] name = "langchain-openai" -version = "0.0.6" +version = "0.0.8" description = "An integration package connecting OpenAI and LangChain" optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "langchain_openai-0.0.6-py3-none-any.whl", hash = "sha256:2ef040e4447a26a9d3bd45dfac9cefa00797ea58555a3d91ab4f88699eb3a005"}, - {file = "langchain_openai-0.0.6.tar.gz", hash = "sha256:f5c4ebe46f2c8635c8f0c26cc8df27700aacafea025410e418d5a080039974dd"}, + {file = 
"langchain_openai-0.0.8-py3-none-any.whl", hash = "sha256:4862fc72cecbee0240aaa6df0234d5893dd30cd33ca23ac5cfdd86c11d2c44df"}, + {file = "langchain_openai-0.0.8.tar.gz", hash = "sha256:b7aba7fcc52305e78b08197ebc54fc45cc06dbc40ba5b913bc48a22b30a4f5c9"}, ] [package.dependencies] -langchain-core = ">=0.1.16,<0.2" -numpy = ">=1,<2" +langchain-core = ">=0.1.27,<0.2.0" openai = ">=1.10.0,<2.0.0" tiktoken = ">=0.5.2,<1" +[[package]] +name = "langchain-text-splitters" +version = "0.0.1" +description = "LangChain text splitting utilities" +optional = false +python-versions = ">=3.8.1,<4.0" +files = [ + {file = "langchain_text_splitters-0.0.1-py3-none-any.whl", hash = "sha256:f5b802f873f5ff6a8b9259ff34d53ed989666ef4e1582e6d1adb3b5520e3839a"}, + {file = "langchain_text_splitters-0.0.1.tar.gz", hash = "sha256:ac459fa98799f5117ad5425a9330b21961321e30bc19a2a2f9f761ddadd62aa1"}, +] + +[package.dependencies] +langchain-core = ">=0.1.28,<0.2.0" + +[package.extras] +extended-testing = ["lxml (>=5.1.0,<6.0.0)"] + [[package]] name = "langsmith" -version = "0.1.5" +version = "0.1.23" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "langsmith-0.1.5-py3-none-any.whl", hash = "sha256:a1811821a923d90e53bcbacdd0988c3c366aff8f4c120d8777e7af8ecda06268"}, - {file = "langsmith-0.1.5.tar.gz", hash = "sha256:aa7a2861aa3d9ae563a077c622953533800466c4e2e539b0d567b84d5fd5b157"}, + {file = "langsmith-0.1.23-py3-none-any.whl", hash = "sha256:69984268b9867cb31b875965b3f86b6f56ba17dd5454d487d3a1a999bdaeea69"}, + {file = "langsmith-0.1.23.tar.gz", hash = "sha256:327c66ec0de8c1bc57bfa47bbc70a29ef749e97c3e5571b9baf754d1e0644220"}, ] [package.dependencies] +orjson = ">=3.9.14,<4.0.0" pydantic = ">=1,<3" requests = ">=2,<3" [[package]] name = "llama-index" -version = "0.10.10" +version = "0.10.18" description = "Interface between LLMs and your data" optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "llama_index-0.10.10-py3-none-any.whl", hash = "sha256:47cb45a3ff0b6576cb221680c179b6b4432cab072f9bca411001721685b37019"}, - {file = "llama_index-0.10.10.tar.gz", hash = "sha256:5570669e30addb7e2113b92a6bd7d39bde40ccdeeb3749dc72df09fe65281504"}, + {file = "llama_index-0.10.18-py3-none-any.whl", hash = "sha256:107a534d72b7d61927c2f013ebae1e385134f2ba044fced8716e1ef55d66cb12"}, + {file = "llama_index-0.10.18.tar.gz", hash = "sha256:116c5d96d783bfcdb88be48f1da63c9d11c56edca062cd3ee85b6054859497f7"}, ] [package.dependencies] llama-index-agent-openai = ">=0.1.4,<0.2.0" llama-index-cli = ">=0.1.2,<0.2.0" -llama-index-core = ">=0.10.9,<0.11.0" +llama-index-core = ">=0.10.18,<0.11.0" llama-index-embeddings-openai = ">=0.1.5,<0.2.0" llama-index-indices-managed-llama-cloud = ">=0.1.2,<0.2.0" llama-index-legacy = ">=0.9.48,<0.10.0" @@ -1566,13 +1586,13 @@ llama-index-readers-llama-parse = ">=0.1.2,<0.2.0" [[package]] name = "llama-index-agent-openai" -version = "0.1.4" +version = "0.1.5" description = "llama-index agent openai integration" optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "llama_index_agent_openai-0.1.4-py3-none-any.whl", hash = "sha256:20e9fcef0c0685d4dbdc0f09d0dfe180ad7016c7ea67e6dbb8f7638b9ba388f0"}, - {file = "llama_index_agent_openai-0.1.4.tar.gz", hash = "sha256:d6563d99abf0c5c9fb930aca97b8eee28e125b0e50a35dcd0c27ef1a6a22923e"}, + {file = "llama_index_agent_openai-0.1.5-py3-none-any.whl", hash = "sha256:1ab06fe853d9d391ba724dcb0009b249ae88ca4de4b5842226373b0c55ee435a"}, + 
{file = "llama_index_agent_openai-0.1.5.tar.gz", hash = "sha256:42099326d526af140493c5f744ef70bef0aed8a941b6c9aea4b3eff9c63f0ba6"}, ] [package.dependencies] @@ -1581,30 +1601,30 @@ llama-index-llms-openai = ">=0.1.5,<0.2.0" [[package]] name = "llama-index-cli" -version = "0.1.3" +version = "0.1.8" description = "llama-index cli" optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "llama_index_cli-0.1.3-py3-none-any.whl", hash = "sha256:7efcbee567b87bae8f4c21787e649d3a4ddaad6ad97e34921fed8b02ac955645"}, - {file = "llama_index_cli-0.1.3.tar.gz", hash = "sha256:7214b43ef85133575846155d0bb0af8509132ee5f02af6ae1630664047cf6fe9"}, + {file = "llama_index_cli-0.1.8-py3-none-any.whl", hash = "sha256:4e300f06206862d6d7eedde95632c6b61a5ebb5162454f1ac7a3c3c9b3ebb05f"}, + {file = "llama_index_cli-0.1.8.tar.gz", hash = "sha256:776a96917965d0df6e7e272d6278394a4f7c922e57973a75e2645609727fa4b1"}, ] [package.dependencies] -llama-index-core = ">=0.10.8.post1,<0.11.0" +llama-index-core = ">=0.10.11.post1,<0.11.0" llama-index-embeddings-openai = ">=0.1.1,<0.2.0" llama-index-llms-openai = ">=0.1.1,<0.2.0" llama-index-vector-stores-chroma = ">=0.1.1,<0.2.0" [[package]] name = "llama-index-core" -version = "0.10.10" +version = "0.10.18.post1" description = "Interface between LLMs and your data" optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "llama_index_core-0.10.10-py3-none-any.whl", hash = "sha256:421629fcde17402d5137f14f317b754f7d348c3478d9e944774e440b8169f278"}, - {file = "llama_index_core-0.10.10.tar.gz", hash = "sha256:8ffb8bd03889ed733106dd1b36edcffb3c5624e2c0569037d4c526c93f478b1c"}, + {file = "llama_index_core-0.10.18.post1-py3-none-any.whl", hash = "sha256:e2383d0e865febfd8dfc96c1d02cd3ee99927c31b2eb3fbb50211d0829e538e5"}, + {file = "llama_index_core-0.10.18.post1.tar.gz", hash = "sha256:0a197f27c13a9d88966469d2361f4f70fe1678d391602dd7a8ec65209c70a24b"}, ] [package.dependencies] @@ -1641,13 +1661,13 @@ query-tools = ["guidance (>=0.0.64,<0.0.65)", "jsonpath-ng (>=1.6.0,<2.0.0)", "l [[package]] name = "llama-index-embeddings-openai" -version = "0.1.5" +version = "0.1.6" description = "llama-index embeddings openai integration" optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "llama_index_embeddings_openai-0.1.5-py3-none-any.whl", hash = "sha256:ce629a4e2f732306e388398f5200cca341c039c4a84df105dc085e2f09d39242"}, - {file = "llama_index_embeddings_openai-0.1.5.tar.gz", hash = "sha256:11c1fea3b00e824e4aca4f136aaf8b941dad5b9532982c24a8bcc00ba8a4d30b"}, + {file = "llama_index_embeddings_openai-0.1.6-py3-none-any.whl", hash = "sha256:f8b2dded0718e9f57c08ce352d186941e6acf7de414c64219210b66f7a6d6d2d"}, + {file = "llama_index_embeddings_openai-0.1.6.tar.gz", hash = "sha256:f12f0ef6f92211efe1a022a97bb68fc8731c93bd20df3b0567dba69c610033db"}, ] [package.dependencies] @@ -1655,13 +1675,13 @@ llama-index-core = ">=0.10.1,<0.11.0" [[package]] name = "llama-index-indices-managed-llama-cloud" -version = "0.1.2" +version = "0.1.3" description = "llama-index indices llama-cloud integration" optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "llama_index_indices_managed_llama_cloud-0.1.2-py3-none-any.whl", hash = "sha256:54388e9955056e7e747cf58795feb9a8fd759d98e5984b6334d8c7ae7fec6c7c"}, - {file = "llama_index_indices_managed_llama_cloud-0.1.2.tar.gz", hash = "sha256:b3eedd890e4a8a9c4d748022b1ae7dee835d875657f3362df64b0cd8990085a8"}, + {file = "llama_index_indices_managed_llama_cloud-0.1.3-py3-none-any.whl", hash = 
"sha256:9fe2823855f00bf8b091be008ce953b9a9c5d4b2d976b54ab0d37877c83457f5"}, + {file = "llama_index_indices_managed_llama_cloud-0.1.3.tar.gz", hash = "sha256:5db725cb7db675019dc65e38153890802e2ae89838c127c19d3184efc46ea28b"}, ] [package.dependencies] @@ -1709,13 +1729,13 @@ query-tools = ["guidance (>=0.0.64,<0.0.65)", "jsonpath-ng (>=1.6.0,<2.0.0)", "l [[package]] name = "llama-index-llms-openai" -version = "0.1.5" +version = "0.1.7" description = "llama-index llms openai integration" optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "llama_index_llms_openai-0.1.5-py3-none-any.whl", hash = "sha256:37e6f485868118a243a9800626688f0884af9f05a1c8bf592a706f143d2340c0"}, - {file = "llama_index_llms_openai-0.1.5.tar.gz", hash = "sha256:9b294d636395a12c8db622c88b07dd0b7f6fb074215ae453a6a642c535f04627"}, + {file = "llama_index_llms_openai-0.1.7-py3-none-any.whl", hash = "sha256:162a7f1064b389d0db6f731bcedaca80e87ceca8aa919d7425ca32107e756243"}, + {file = "llama_index_llms_openai-0.1.7.tar.gz", hash = "sha256:5ddb405c0a5847a7c2098a70ced270555f036c2793412a8992456bd32f83ff0f"}, ] [package.dependencies] @@ -1723,13 +1743,13 @@ llama-index-core = ">=0.10.1,<0.11.0" [[package]] name = "llama-index-multi-modal-llms-openai" -version = "0.1.3" +version = "0.1.4" description = "llama-index multi-modal-llms openai integration" optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "llama_index_multi_modal_llms_openai-0.1.3-py3-none-any.whl", hash = "sha256:cc3affc9977659a874ad360d6410d2f858e90e26643dacd7ebfed5bf59d7f35d"}, - {file = "llama_index_multi_modal_llms_openai-0.1.3.tar.gz", hash = "sha256:e63b98c0473ee128b2ed493ff8cb96bbcfdf0b8d6fe655e73b956158269904ec"}, + {file = "llama_index_multi_modal_llms_openai-0.1.4-py3-none-any.whl", hash = "sha256:03b887d110551d5d5b99b9fd110824e6311f2e31f4d5e67dafd2ee66da32818d"}, + {file = "llama_index_multi_modal_llms_openai-0.1.4.tar.gz", hash = "sha256:6a5d6584c33a9d1b06cf5c874c63af2603fc93b660bde481a8c547e876c6e2c3"}, ] [package.dependencies] @@ -1738,13 +1758,13 @@ llama-index-llms-openai = ">=0.1.1,<0.2.0" [[package]] name = "llama-index-program-openai" -version = "0.1.3" +version = "0.1.4" description = "llama-index program openai integration" optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "llama_index_program_openai-0.1.3-py3-none-any.whl", hash = "sha256:6f289836d667ff813451d6640ff8d9eff87ed344cc8141b8be5a314f638b4f49"}, - {file = "llama_index_program_openai-0.1.3.tar.gz", hash = "sha256:1950cd69f2247f69c10fdd731183c553e5b776ac0110b7e55f1c27e866208728"}, + {file = "llama_index_program_openai-0.1.4-py3-none-any.whl", hash = "sha256:cfa8f00f3743d2fc70043e80f7c3925d23b1413a0cc7a72863ad60497a18307d"}, + {file = "llama_index_program_openai-0.1.4.tar.gz", hash = "sha256:573e99a2dd16ad3caf382c8ab28d1ac10eb2571bc9481d84a6d89806ad6aa5d4"}, ] [package.dependencies] @@ -1754,13 +1774,13 @@ llama-index-llms-openai = ">=0.1.1,<0.2.0" [[package]] name = "llama-index-question-gen-openai" -version = "0.1.2" +version = "0.1.3" description = "llama-index question_gen openai integration" optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "llama_index_question_gen_openai-0.1.2-py3-none-any.whl", hash = "sha256:eb1face5f34177afe557b5a96aaff8caeee108dad2dd683e01b885381548c342"}, - {file = "llama_index_question_gen_openai-0.1.2.tar.gz", hash = "sha256:889b98ac5ccff146e51434b254934a5b2a26c71ca19334654a89fb9e8dbe498d"}, + {file = "llama_index_question_gen_openai-0.1.3-py3-none-any.whl", hash = 
"sha256:1f83b49e8b2e665030d1ec8c54687d6985d9fa8426147b64e46628a9e489b302"}, + {file = "llama_index_question_gen_openai-0.1.3.tar.gz", hash = "sha256:4486198117a45457d2e036ae60b93af58052893cc7d78fa9b6f47dd47b81e2e1"}, ] [package.dependencies] @@ -1770,13 +1790,13 @@ llama-index-program-openai = ">=0.1.1,<0.2.0" [[package]] name = "llama-index-readers-file" -version = "0.1.4" +version = "0.1.8" description = "llama-index readers file integration" optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "llama_index_readers_file-0.1.4-py3-none-any.whl", hash = "sha256:d8c6b470b47b452823fccf1795671fe0726e8f96a21c324a7cf7bebef54dce4b"}, - {file = "llama_index_readers_file-0.1.4.tar.gz", hash = "sha256:2533241eb3cb990677cd57cfb1d997f87e4e5ede087dd1ba79069cfb5d255c98"}, + {file = "llama_index_readers_file-0.1.8-py3-none-any.whl", hash = "sha256:f58c72e2c2ed9f36b5308b4b9ee3142f3848156f0e3b85e813db0a26b8d03290"}, + {file = "llama_index_readers_file-0.1.8.tar.gz", hash = "sha256:f23417a2afc8461a32f08f057e85c8d09b1c687ba16ca6a6a08f08f319eca26a"}, ] [package.dependencies] @@ -1788,13 +1808,13 @@ pypdf = ">=4.0.1,<5.0.0" [[package]] name = "llama-index-readers-llama-parse" -version = "0.1.2" +version = "0.1.3" description = "llama-index readers llama-parse integration" optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "llama_index_readers_llama_parse-0.1.2-py3-none-any.whl", hash = "sha256:df74ad3ac4bb2a26bc6cabb6146e158f4350a6461948b6b0d1ceb5fc51987299"}, - {file = "llama_index_readers_llama_parse-0.1.2.tar.gz", hash = "sha256:fbe29365b0865bcf1166083c2650b2ce2a6ce41e3ece226811260120b53a0b55"}, + {file = "llama_index_readers_llama_parse-0.1.3-py3-none-any.whl", hash = "sha256:f52a06a2765a2ffe6c138cf1703ab1de6249ff069ba62d80b9147e849bbcbc27"}, + {file = "llama_index_readers_llama_parse-0.1.3.tar.gz", hash = "sha256:e0ee0c393e10fc80eac644788338bbd2032050c8b8a474f3d0b5ebd08e9867fe"}, ] [package.dependencies] @@ -1803,30 +1823,28 @@ llama-parse = ">=0.3.3,<0.4.0" [[package]] name = "llama-index-vector-stores-chroma" -version = "0.1.2" +version = "0.1.6" description = "llama-index vector_stores chroma integration" optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "llama_index_vector_stores_chroma-0.1.2-py3-none-any.whl", hash = "sha256:86b7d716e659bb2b335447c8064115fad0fcbcebfa84288586d7cdff07b5aa0c"}, - {file = "llama_index_vector_stores_chroma-0.1.2.tar.gz", hash = "sha256:094e9e502239cea78b75a7ab64a564b9e93dddc570305bd5758d09bb2b9556a0"}, + {file = "llama_index_vector_stores_chroma-0.1.6-py3-none-any.whl", hash = "sha256:506b1cb9a7a552ecb3afa70ddf479c1c683fcbfe7313654e3543e62e3ec07eae"}, + {file = "llama_index_vector_stores_chroma-0.1.6.tar.gz", hash = "sha256:6dff3dc9d79c4039fa46cd528e5e4b6ded6473e5ef632424134491007da4ebdb"}, ] [package.dependencies] chromadb = ">=0.4.22,<0.5.0" llama-index-core = ">=0.10.1,<0.11.0" -onnxruntime = ">=1.17.0,<2.0.0" -tokenizers = ">=0.15.1,<0.16.0" [[package]] name = "llama-parse" -version = "0.3.4" +version = "0.3.8" description = "Parse files into RAG-Optimized formats." 
optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "llama_parse-0.3.4-py3-none-any.whl", hash = "sha256:b667c78d4c32fc5d0561e6e3ca6c53648a6701b436f21d0d252cd46774927660"}, - {file = "llama_parse-0.3.4.tar.gz", hash = "sha256:5a30569c390ab9089dad66cf2a8c967f8c21d77641deec0a922672df4e16cfa3"}, + {file = "llama_parse-0.3.8-py3-none-any.whl", hash = "sha256:2f6222c4f9f8b70622a799fca8438972e3e2f19fa8273e2c8be46af314c9a367"}, + {file = "llama_parse-0.3.8.tar.gz", hash = "sha256:3d4739726687e6602e7cacbc9f17d438d39989a4a73324fc99122b3aefa384a4"}, ] [package.dependencies] @@ -1873,22 +1891,21 @@ testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] [[package]] name = "marshmallow" -version = "3.20.2" +version = "3.21.1" description = "A lightweight library for converting complex datatypes to and from native Python datatypes." optional = false python-versions = ">=3.8" files = [ - {file = "marshmallow-3.20.2-py3-none-any.whl", hash = "sha256:c21d4b98fee747c130e6bc8f45c4b3199ea66bc00c12ee1f639f0aeca034d5e9"}, - {file = "marshmallow-3.20.2.tar.gz", hash = "sha256:4c1daff273513dc5eb24b219a8035559dc573c8f322558ef85f5438ddd1236dd"}, + {file = "marshmallow-3.21.1-py3-none-any.whl", hash = "sha256:f085493f79efb0644f270a9bf2892843142d80d7174bbbd2f3713f2a589dc633"}, + {file = "marshmallow-3.21.1.tar.gz", hash = "sha256:4e65e9e0d80fc9e609574b9983cf32579f305c718afb30d7233ab818571768c3"}, ] [package.dependencies] packaging = ">=17.0" [package.extras] -dev = ["pre-commit (>=2.4,<4.0)", "pytest", "pytz", "simplejson", "tox"] -docs = ["alabaster (==0.7.15)", "autodocsumm (==0.2.12)", "sphinx (==7.2.6)", "sphinx-issues (==3.0.1)", "sphinx-version-warning (==1.1.2)"] -lint = ["pre-commit (>=2.4,<4.0)"] +dev = ["marshmallow[tests]", "pre-commit (>=3.5,<4.0)", "tox"] +docs = ["alabaster (==0.7.16)", "autodocsumm (==0.2.12)", "sphinx (==7.2.6)", "sphinx-issues (==4.0.0)", "sphinx-version-warning (==1.1.2)"] tests = ["pytest", "pytz", "simplejson"] [[package]] @@ -2273,36 +2290,36 @@ signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] [[package]] name = "onnxruntime" -version = "1.17.0" +version = "1.17.1" description = "ONNX Runtime is a runtime accelerator for Machine Learning models" optional = false python-versions = "*" files = [ - {file = "onnxruntime-1.17.0-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:d2b22a25a94109cc983443116da8d9805ced0256eb215c5e6bc6dcbabefeab96"}, - {file = "onnxruntime-1.17.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b4c87d83c6f58d1af2675fc99e3dc810f2dbdb844bcefd0c1b7573632661f6fc"}, - {file = "onnxruntime-1.17.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dba55723bf9b835e358f48c98a814b41692c393eb11f51e02ece0625c756b797"}, - {file = "onnxruntime-1.17.0-cp310-cp310-win32.whl", hash = "sha256:ee48422349cc500273beea7607e33c2237909f58468ae1d6cccfc4aecd158565"}, - {file = "onnxruntime-1.17.0-cp310-cp310-win_amd64.whl", hash = "sha256:f34cc46553359293854e38bdae2ab1be59543aad78a6317e7746d30e311110c3"}, - {file = "onnxruntime-1.17.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:16d26badd092c8c257fa57c458bb600d96dc15282c647ccad0ed7b2732e6c03b"}, - {file = "onnxruntime-1.17.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6f1273bebcdb47ed932d076c85eb9488bc4768fcea16d5f2747ca692fad4f9d3"}, - {file = "onnxruntime-1.17.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:cb60fd3c2c1acd684752eb9680e89ae223e9801a9b0e0dc7b28adabe45a2e380"}, - {file = "onnxruntime-1.17.0-cp311-cp311-win32.whl", hash = "sha256:4b038324586bc905299e435f7c00007e6242389c856b82fe9357fdc3b1ef2bdc"}, - {file = "onnxruntime-1.17.0-cp311-cp311-win_amd64.whl", hash = "sha256:93d39b3fa1ee01f034f098e1c7769a811a21365b4883f05f96c14a2b60c6028b"}, - {file = "onnxruntime-1.17.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:90c0890e36f880281c6c698d9bc3de2afbeee2f76512725ec043665c25c67d21"}, - {file = "onnxruntime-1.17.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7466724e809a40e986b1637cba156ad9fc0d1952468bc00f79ef340bc0199552"}, - {file = "onnxruntime-1.17.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d47bee7557a8b99c8681b6882657a515a4199778d6d5e24e924d2aafcef55b0a"}, - {file = "onnxruntime-1.17.0-cp312-cp312-win32.whl", hash = "sha256:bb1bf1ee575c665b8bbc3813ab906e091a645a24ccc210be7932154b8260eca1"}, - {file = "onnxruntime-1.17.0-cp312-cp312-win_amd64.whl", hash = "sha256:ac2f286da3494b29b4186ca193c7d4e6a2c1f770c4184c7192c5da142c3dec28"}, - {file = "onnxruntime-1.17.0-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:1ec485643b93e0a3896c655eb2426decd63e18a278bb7ccebc133b340723624f"}, - {file = "onnxruntime-1.17.0-cp38-cp38-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:83c35809cda898c5a11911c69ceac8a2ac3925911854c526f73bad884582f911"}, - {file = "onnxruntime-1.17.0-cp38-cp38-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fa464aa4d81df818375239e481887b656e261377d5b6b9a4692466f5f3261edc"}, - {file = "onnxruntime-1.17.0-cp38-cp38-win32.whl", hash = "sha256:b7b337cd0586f7836601623cbd30a443df9528ef23965860d11c753ceeb009f2"}, - {file = "onnxruntime-1.17.0-cp38-cp38-win_amd64.whl", hash = "sha256:fbb9faaf51d01aa2c147ef52524d9326744c852116d8005b9041809a71838878"}, - {file = "onnxruntime-1.17.0-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:5a06ab84eaa350bf64b1d747b33ccf10da64221ed1f38f7287f15eccbec81603"}, - {file = "onnxruntime-1.17.0-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d3d11db2c8242766212a68d0b139745157da7ce53bd96ba349a5c65e5a02357"}, - {file = "onnxruntime-1.17.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5632077c3ab8b0cd4f74b0af9c4e924be012b1a7bcd7daa845763c6c6bf14b7d"}, - {file = "onnxruntime-1.17.0-cp39-cp39-win32.whl", hash = "sha256:61a12732cba869b3ad2d4e29ab6cb62c7a96f61b8c213f7fcb961ba412b70b37"}, - {file = "onnxruntime-1.17.0-cp39-cp39-win_amd64.whl", hash = "sha256:461fa0fc7d9c392c352b6cccdedf44d818430f3d6eacd924bb804fdea2dcfd02"}, + {file = "onnxruntime-1.17.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:d43ac17ac4fa3c9096ad3c0e5255bb41fd134560212dc124e7f52c3159af5d21"}, + {file = "onnxruntime-1.17.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:55b5e92a4c76a23981c998078b9bf6145e4fb0b016321a8274b1607bd3c6bd35"}, + {file = "onnxruntime-1.17.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ebbcd2bc3a066cf54e6f18c75708eb4d309ef42be54606d22e5bdd78afc5b0d7"}, + {file = "onnxruntime-1.17.1-cp310-cp310-win32.whl", hash = "sha256:5e3716b5eec9092e29a8d17aab55e737480487deabfca7eac3cd3ed952b6ada9"}, + {file = "onnxruntime-1.17.1-cp310-cp310-win_amd64.whl", hash = "sha256:fbb98cced6782ae1bb799cc74ddcbbeeae8819f3ad1d942a74d88e72b6511337"}, + {file = "onnxruntime-1.17.1-cp311-cp311-macosx_11_0_universal2.whl", hash = 
"sha256:36fd6f87a1ecad87e9c652e42407a50fb305374f9a31d71293eb231caae18784"}, + {file = "onnxruntime-1.17.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:99a8bddeb538edabc524d468edb60ad4722cff8a49d66f4e280c39eace70500b"}, + {file = "onnxruntime-1.17.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fd7fddb4311deb5a7d3390cd8e9b3912d4d963efbe4dfe075edbaf18d01c024e"}, + {file = "onnxruntime-1.17.1-cp311-cp311-win32.whl", hash = "sha256:606a7cbfb6680202b0e4f1890881041ffc3ac6e41760a25763bd9fe146f0b335"}, + {file = "onnxruntime-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:53e4e06c0a541696ebdf96085fd9390304b7b04b748a19e02cf3b35c869a1e76"}, + {file = "onnxruntime-1.17.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:40f08e378e0f85929712a2b2c9b9a9cc400a90c8a8ca741d1d92c00abec60843"}, + {file = "onnxruntime-1.17.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ac79da6d3e1bb4590f1dad4bb3c2979d7228555f92bb39820889af8b8e6bd472"}, + {file = "onnxruntime-1.17.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ae9ba47dc099004e3781f2d0814ad710a13c868c739ab086fc697524061695ea"}, + {file = "onnxruntime-1.17.1-cp312-cp312-win32.whl", hash = "sha256:2dff1a24354220ac30e4a4ce2fb1df38cb1ea59f7dac2c116238d63fe7f4c5ff"}, + {file = "onnxruntime-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:6226a5201ab8cafb15e12e72ff2a4fc8f50654e8fa5737c6f0bd57c5ff66827e"}, + {file = "onnxruntime-1.17.1-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:cd0c07c0d1dfb8629e820b05fda5739e4835b3b82faf43753d2998edf2cf00aa"}, + {file = "onnxruntime-1.17.1-cp38-cp38-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:617ebdf49184efa1ba6e4467e602fbfa029ed52c92f13ce3c9f417d303006381"}, + {file = "onnxruntime-1.17.1-cp38-cp38-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9dae9071e3facdf2920769dceee03b71c684b6439021defa45b830d05e148924"}, + {file = "onnxruntime-1.17.1-cp38-cp38-win32.whl", hash = "sha256:835d38fa1064841679433b1aa8138b5e1218ddf0cfa7a3ae0d056d8fd9cec713"}, + {file = "onnxruntime-1.17.1-cp38-cp38-win_amd64.whl", hash = "sha256:96621e0c555c2453bf607606d08af3f70fbf6f315230c28ddea91754e17ad4e6"}, + {file = "onnxruntime-1.17.1-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:7a9539935fb2d78ebf2cf2693cad02d9930b0fb23cdd5cf37a7df813e977674d"}, + {file = "onnxruntime-1.17.1-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:45c6a384e9d9a29c78afff62032a46a993c477b280247a7e335df09372aedbe9"}, + {file = "onnxruntime-1.17.1-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4e19f966450f16863a1d6182a685ca33ae04d7772a76132303852d05b95411ea"}, + {file = "onnxruntime-1.17.1-cp39-cp39-win32.whl", hash = "sha256:e2ae712d64a42aac29ed7a40a426cb1e624a08cfe9273dcfe681614aa65b07dc"}, + {file = "onnxruntime-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:f7e9f7fb049825cdddf4a923cfc7c649d84d63c0134315f8e0aa9e0c3004672c"}, ] [package.dependencies] @@ -2315,13 +2332,13 @@ sympy = "*" [[package]] name = "openai" -version = "1.12.0" +version = "1.13.3" description = "The official Python library for the openai API" optional = false python-versions = ">=3.7.1" files = [ - {file = "openai-1.12.0-py3-none-any.whl", hash = "sha256:a54002c814e05222e413664f651b5916714e4700d041d5cf5724d3ae1a3e3481"}, - {file = "openai-1.12.0.tar.gz", hash = "sha256:99c5d257d09ea6533d689d1cc77caa0ac679fa21efef8893d8b0832a86877f1b"}, + {file = 
"openai-1.13.3-py3-none-any.whl", hash = "sha256:5769b62abd02f350a8dd1a3a242d8972c947860654466171d60fb0972ae0a41c"}, + {file = "openai-1.13.3.tar.gz", hash = "sha256:ff6c6b3bc7327e715e4b3592a923a5a1c7519ff5dd764a83d69f633d49e77a7b"}, ] [package.dependencies] @@ -2338,13 +2355,13 @@ datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] [[package]] name = "opentelemetry-api" -version = "1.22.0" +version = "1.23.0" description = "OpenTelemetry Python API" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "opentelemetry_api-1.22.0-py3-none-any.whl", hash = "sha256:43621514301a7e9f5d06dd8013a1b450f30c2e9372b8e30aaeb4562abf2ce034"}, - {file = "opentelemetry_api-1.22.0.tar.gz", hash = "sha256:15ae4ca925ecf9cfdfb7a709250846fbb08072260fca08ade78056c502b86bed"}, + {file = "opentelemetry_api-1.23.0-py3-none-any.whl", hash = "sha256:cc03ea4025353048aadb9c64919099663664672ea1c6be6ddd8fee8e4cd5e774"}, + {file = "opentelemetry_api-1.23.0.tar.gz", hash = "sha256:14a766548c8dd2eb4dfc349739eb4c3893712a0daa996e5dbf945f9da665da9d"}, ] [package.dependencies] @@ -2353,52 +2370,50 @@ importlib-metadata = ">=6.0,<7.0" [[package]] name = "opentelemetry-exporter-otlp-proto-common" -version = "1.22.0" +version = "1.23.0" description = "OpenTelemetry Protobuf encoding" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "opentelemetry_exporter_otlp_proto_common-1.22.0-py3-none-any.whl", hash = "sha256:3f2538bec5312587f8676c332b3747f54c89fe6364803a807e217af4603201fa"}, - {file = "opentelemetry_exporter_otlp_proto_common-1.22.0.tar.gz", hash = "sha256:71ae2f81bc6d6fe408d06388826edc8933759b2ca3a97d24054507dc7cfce52d"}, + {file = "opentelemetry_exporter_otlp_proto_common-1.23.0-py3-none-any.whl", hash = "sha256:2a9e7e9d5a8b026b572684b6b24dcdefcaa58613d5ce3d644130b0c373c056c1"}, + {file = "opentelemetry_exporter_otlp_proto_common-1.23.0.tar.gz", hash = "sha256:35e4ea909e7a0b24235bd0aaf17fba49676527feb1823b46565ff246d5a1ab18"}, ] [package.dependencies] -backoff = {version = ">=1.10.0,<3.0.0", markers = "python_version >= \"3.7\""} -opentelemetry-proto = "1.22.0" +opentelemetry-proto = "1.23.0" [[package]] name = "opentelemetry-exporter-otlp-proto-grpc" -version = "1.22.0" +version = "1.23.0" description = "OpenTelemetry Collector Protobuf over gRPC Exporter" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "opentelemetry_exporter_otlp_proto_grpc-1.22.0-py3-none-any.whl", hash = "sha256:b5bcadc129272004316a455e9081216d3380c1fc2231a928ea6a70aa90e173fb"}, - {file = "opentelemetry_exporter_otlp_proto_grpc-1.22.0.tar.gz", hash = "sha256:1e0e5aa4bbabc74942f06f268deffd94851d12a8dc30b02527472ef1729fe5b1"}, + {file = "opentelemetry_exporter_otlp_proto_grpc-1.23.0-py3-none-any.whl", hash = "sha256:40f9e3e7761eb34f2a1001f4543028783ac26e2db27e420d5374f2cca0182dad"}, + {file = "opentelemetry_exporter_otlp_proto_grpc-1.23.0.tar.gz", hash = "sha256:aa1a012eea5342bfef51fcf3f7f22601dcb0f0984a07ffe6025b2fbb6d91a2a9"}, ] [package.dependencies] -backoff = {version = ">=1.10.0,<3.0.0", markers = "python_version >= \"3.7\""} deprecated = ">=1.2.6" googleapis-common-protos = ">=1.52,<2.0" grpcio = ">=1.0.0,<2.0.0" opentelemetry-api = ">=1.15,<2.0" -opentelemetry-exporter-otlp-proto-common = "1.22.0" -opentelemetry-proto = "1.22.0" -opentelemetry-sdk = ">=1.22.0,<1.23.0" +opentelemetry-exporter-otlp-proto-common = "1.23.0" +opentelemetry-proto = "1.23.0" +opentelemetry-sdk = ">=1.23.0,<1.24.0" 
[package.extras] test = ["pytest-grpc"] [[package]] name = "opentelemetry-instrumentation" -version = "0.43b0" +version = "0.44b0" description = "Instrumentation Tools & Auto Instrumentation for OpenTelemetry Python" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "opentelemetry_instrumentation-0.43b0-py3-none-any.whl", hash = "sha256:0ff1334d7e359e27640e9d420024efeb73eacae464309c2e14ede7ba6c93967e"}, - {file = "opentelemetry_instrumentation-0.43b0.tar.gz", hash = "sha256:c3755da6c4be8033be0216d0501e11f4832690f4e2eca5a3576fbf113498f0f6"}, + {file = "opentelemetry_instrumentation-0.44b0-py3-none-any.whl", hash = "sha256:79560f386425176bcc60c59190064597096114c4a8e5154f1cb281bb4e47d2fc"}, + {file = "opentelemetry_instrumentation-0.44b0.tar.gz", hash = "sha256:8213d02d8c0987b9b26386ae3e091e0477d6331673123df736479322e1a50b48"}, ] [package.dependencies] @@ -2408,57 +2423,57 @@ wrapt = ">=1.0.0,<2.0.0" [[package]] name = "opentelemetry-instrumentation-asgi" -version = "0.43b0" +version = "0.44b0" description = "ASGI instrumentation for OpenTelemetry" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "opentelemetry_instrumentation_asgi-0.43b0-py3-none-any.whl", hash = "sha256:1f593829fa039e9367820736fb063e92acd15c25b53d7bcb5d319971b8e93fd7"}, - {file = "opentelemetry_instrumentation_asgi-0.43b0.tar.gz", hash = "sha256:3f6f19333dca31ef696672e4e36cb1c2613c71dc7e847c11ff36a37e1130dadc"}, + {file = "opentelemetry_instrumentation_asgi-0.44b0-py3-none-any.whl", hash = "sha256:0d95c84a8991008c8a8ac35e15d43cc7768a5bb46f95f129e802ad2990d7c366"}, + {file = "opentelemetry_instrumentation_asgi-0.44b0.tar.gz", hash = "sha256:72d4d28ec7ccd551eac11edc5ae8cac3586c0a228467d6a95fad7b6d4edd597a"}, ] [package.dependencies] asgiref = ">=3.0,<4.0" opentelemetry-api = ">=1.12,<2.0" -opentelemetry-instrumentation = "0.43b0" -opentelemetry-semantic-conventions = "0.43b0" -opentelemetry-util-http = "0.43b0" +opentelemetry-instrumentation = "0.44b0" +opentelemetry-semantic-conventions = "0.44b0" +opentelemetry-util-http = "0.44b0" [package.extras] instruments = ["asgiref (>=3.0,<4.0)"] -test = ["opentelemetry-instrumentation-asgi[instruments]", "opentelemetry-test-utils (==0.43b0)"] +test = ["opentelemetry-instrumentation-asgi[instruments]", "opentelemetry-test-utils (==0.44b0)"] [[package]] name = "opentelemetry-instrumentation-fastapi" -version = "0.43b0" +version = "0.44b0" description = "OpenTelemetry FastAPI Instrumentation" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "opentelemetry_instrumentation_fastapi-0.43b0-py3-none-any.whl", hash = "sha256:b79c044df68a52e07b35fa12a424e7cc0dd27ff0a171c5fdcc41dea9de8fc938"}, - {file = "opentelemetry_instrumentation_fastapi-0.43b0.tar.gz", hash = "sha256:2afaaf470622e1a2732182c68f6d2431ffe5e026a7edacd0f83605632b66347f"}, + {file = "opentelemetry_instrumentation_fastapi-0.44b0-py3-none-any.whl", hash = "sha256:4441482944bea6676816668d56deb94af990e8c6e9582c581047e5d84c91d3c9"}, + {file = "opentelemetry_instrumentation_fastapi-0.44b0.tar.gz", hash = "sha256:67ed10b93ad9d35238ae0be73cf8acbbb65a4a61fb7444d0aee5b0c492e294db"}, ] [package.dependencies] opentelemetry-api = ">=1.12,<2.0" -opentelemetry-instrumentation = "0.43b0" -opentelemetry-instrumentation-asgi = "0.43b0" -opentelemetry-semantic-conventions = "0.43b0" -opentelemetry-util-http = "0.43b0" +opentelemetry-instrumentation = "0.44b0" +opentelemetry-instrumentation-asgi = "0.44b0" 
+opentelemetry-semantic-conventions = "0.44b0" +opentelemetry-util-http = "0.44b0" [package.extras] instruments = ["fastapi (>=0.58,<1.0)"] -test = ["httpx (>=0.22,<1.0)", "opentelemetry-instrumentation-fastapi[instruments]", "opentelemetry-test-utils (==0.43b0)", "requests (>=2.23,<3.0)"] +test = ["httpx (>=0.22,<1.0)", "opentelemetry-instrumentation-fastapi[instruments]", "opentelemetry-test-utils (==0.44b0)", "requests (>=2.23,<3.0)"] [[package]] name = "opentelemetry-proto" -version = "1.22.0" +version = "1.23.0" description = "OpenTelemetry Python Proto" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "opentelemetry_proto-1.22.0-py3-none-any.whl", hash = "sha256:ce7188d22c75b6d0fe53e7fb58501613d0feade5139538e79dedd9420610fa0c"}, - {file = "opentelemetry_proto-1.22.0.tar.gz", hash = "sha256:9ec29169286029f17ca34ec1f3455802ffb90131642d2f545ece9a63e8f69003"}, + {file = "opentelemetry_proto-1.23.0-py3-none-any.whl", hash = "sha256:4c017deca052cb287a6003b7c989ed8b47af65baeb5d57ebf93dde0793f78509"}, + {file = "opentelemetry_proto-1.23.0.tar.gz", hash = "sha256:e6aaf8b7ace8d021942d546161401b83eed90f9f2cc6f13275008cea730e4651"}, ] [package.dependencies] @@ -2466,40 +2481,99 @@ protobuf = ">=3.19,<5.0" [[package]] name = "opentelemetry-sdk" -version = "1.22.0" +version = "1.23.0" description = "OpenTelemetry Python SDK" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "opentelemetry_sdk-1.22.0-py3-none-any.whl", hash = "sha256:a730555713d7c8931657612a88a141e3a4fe6eb5523d9e2d5a8b1e673d76efa6"}, - {file = "opentelemetry_sdk-1.22.0.tar.gz", hash = "sha256:45267ac1f38a431fc2eb5d6e0c0d83afc0b78de57ac345488aa58c28c17991d0"}, + {file = "opentelemetry_sdk-1.23.0-py3-none-any.whl", hash = "sha256:a93c96990ac0f07c6d679e2f1015864ff7a4f5587122dd5af968034436efb1fd"}, + {file = "opentelemetry_sdk-1.23.0.tar.gz", hash = "sha256:9ddf60195837b59e72fd2033d6a47e2b59a0f74f0ec37d89387d89e3da8cab7f"}, ] [package.dependencies] -opentelemetry-api = "1.22.0" -opentelemetry-semantic-conventions = "0.43b0" +opentelemetry-api = "1.23.0" +opentelemetry-semantic-conventions = "0.44b0" typing-extensions = ">=3.7.4" [[package]] name = "opentelemetry-semantic-conventions" -version = "0.43b0" +version = "0.44b0" description = "OpenTelemetry Semantic Conventions" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "opentelemetry_semantic_conventions-0.43b0-py3-none-any.whl", hash = "sha256:291284d7c1bf15fdaddf309b3bd6d3b7ce12a253cec6d27144439819a15d8445"}, - {file = "opentelemetry_semantic_conventions-0.43b0.tar.gz", hash = "sha256:b9576fb890df479626fa624e88dde42d3d60b8b6c8ae1152ad157a8b97358635"}, + {file = "opentelemetry_semantic_conventions-0.44b0-py3-none-any.whl", hash = "sha256:7c434546c9cbd797ab980cc88bf9ff3f4a5a28f941117cad21694e43d5d92019"}, + {file = "opentelemetry_semantic_conventions-0.44b0.tar.gz", hash = "sha256:2e997cb28cd4ca81a25a9a43365f593d0c2b76be0685015349a89abdf1aa4ffa"}, ] [[package]] name = "opentelemetry-util-http" -version = "0.43b0" +version = "0.44b0" description = "Web util for OpenTelemetry" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" +files = [ + {file = "opentelemetry_util_http-0.44b0-py3-none-any.whl", hash = "sha256:ff018ab6a2fa349537ff21adcef99a294248b599be53843c44f367aef6bccea5"}, + {file = "opentelemetry_util_http-0.44b0.tar.gz", hash = "sha256:75896dffcbbeb5df5429ad4526e22307fc041a27114e0c5bfd90bb219381e68f"}, +] + +[[package]] +name = 
"orjson" +version = "3.9.15" +description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy" +optional = false +python-versions = ">=3.8" files = [ - {file = "opentelemetry_util_http-0.43b0-py3-none-any.whl", hash = "sha256:f25a820784b030f6cb86b3d76e5676c769b75ed3f55a210bcdae0a5e175ebadb"}, - {file = "opentelemetry_util_http-0.43b0.tar.gz", hash = "sha256:3ff6ab361dbe99fc81200d625603c0fb890c055c6e416a3e6d661ddf47a6c7f7"}, + {file = "orjson-3.9.15-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:d61f7ce4727a9fa7680cd6f3986b0e2c732639f46a5e0156e550e35258aa313a"}, + {file = "orjson-3.9.15-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4feeb41882e8aa17634b589533baafdceb387e01e117b1ec65534ec724023d04"}, + {file = "orjson-3.9.15-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fbbeb3c9b2edb5fd044b2a070f127a0ac456ffd079cb82746fc84af01ef021a4"}, + {file = "orjson-3.9.15-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b66bcc5670e8a6b78f0313bcb74774c8291f6f8aeef10fe70e910b8040f3ab75"}, + {file = "orjson-3.9.15-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2973474811db7b35c30248d1129c64fd2bdf40d57d84beed2a9a379a6f57d0ab"}, + {file = "orjson-3.9.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fe41b6f72f52d3da4db524c8653e46243c8c92df826ab5ffaece2dba9cccd58"}, + {file = "orjson-3.9.15-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4228aace81781cc9d05a3ec3a6d2673a1ad0d8725b4e915f1089803e9efd2b99"}, + {file = "orjson-3.9.15-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:6f7b65bfaf69493c73423ce9db66cfe9138b2f9ef62897486417a8fcb0a92bfe"}, + {file = "orjson-3.9.15-cp310-none-win32.whl", hash = "sha256:2d99e3c4c13a7b0fb3792cc04c2829c9db07838fb6973e578b85c1745e7d0ce7"}, + {file = "orjson-3.9.15-cp310-none-win_amd64.whl", hash = "sha256:b725da33e6e58e4a5d27958568484aa766e825e93aa20c26c91168be58e08cbb"}, + {file = "orjson-3.9.15-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:c8e8fe01e435005d4421f183038fc70ca85d2c1e490f51fb972db92af6e047c2"}, + {file = "orjson-3.9.15-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87f1097acb569dde17f246faa268759a71a2cb8c96dd392cd25c668b104cad2f"}, + {file = "orjson-3.9.15-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ff0f9913d82e1d1fadbd976424c316fbc4d9c525c81d047bbdd16bd27dd98cfc"}, + {file = "orjson-3.9.15-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8055ec598605b0077e29652ccfe9372247474375e0e3f5775c91d9434e12d6b1"}, + {file = "orjson-3.9.15-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d6768a327ea1ba44c9114dba5fdda4a214bdb70129065cd0807eb5f010bfcbb5"}, + {file = "orjson-3.9.15-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:12365576039b1a5a47df01aadb353b68223da413e2e7f98c02403061aad34bde"}, + {file = "orjson-3.9.15-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:71c6b009d431b3839d7c14c3af86788b3cfac41e969e3e1c22f8a6ea13139404"}, + {file = "orjson-3.9.15-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e18668f1bd39e69b7fed19fa7cd1cd110a121ec25439328b5c89934e6d30d357"}, + {file = "orjson-3.9.15-cp311-none-win32.whl", hash = "sha256:62482873e0289cf7313461009bf62ac8b2e54bc6f00c6fabcde785709231a5d7"}, + {file = 
"orjson-3.9.15-cp311-none-win_amd64.whl", hash = "sha256:b3d336ed75d17c7b1af233a6561cf421dee41d9204aa3cfcc6c9c65cd5bb69a8"}, + {file = "orjson-3.9.15-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:82425dd5c7bd3adfe4e94c78e27e2fa02971750c2b7ffba648b0f5d5cc016a73"}, + {file = "orjson-3.9.15-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c51378d4a8255b2e7c1e5cc430644f0939539deddfa77f6fac7b56a9784160a"}, + {file = "orjson-3.9.15-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6ae4e06be04dc00618247c4ae3f7c3e561d5bc19ab6941427f6d3722a0875ef7"}, + {file = "orjson-3.9.15-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bcef128f970bb63ecf9a65f7beafd9b55e3aaf0efc271a4154050fc15cdb386e"}, + {file = "orjson-3.9.15-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b72758f3ffc36ca566ba98a8e7f4f373b6c17c646ff8ad9b21ad10c29186f00d"}, + {file = "orjson-3.9.15-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10c57bc7b946cf2efa67ac55766e41764b66d40cbd9489041e637c1304400494"}, + {file = "orjson-3.9.15-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:946c3a1ef25338e78107fba746f299f926db408d34553b4754e90a7de1d44068"}, + {file = "orjson-3.9.15-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2f256d03957075fcb5923410058982aea85455d035607486ccb847f095442bda"}, + {file = "orjson-3.9.15-cp312-none-win_amd64.whl", hash = "sha256:5bb399e1b49db120653a31463b4a7b27cf2fbfe60469546baf681d1b39f4edf2"}, + {file = "orjson-3.9.15-cp38-cp38-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:b17f0f14a9c0ba55ff6279a922d1932e24b13fc218a3e968ecdbf791b3682b25"}, + {file = "orjson-3.9.15-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f6cbd8e6e446fb7e4ed5bac4661a29e43f38aeecbf60c4b900b825a353276a1"}, + {file = "orjson-3.9.15-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:76bc6356d07c1d9f4b782813094d0caf1703b729d876ab6a676f3aaa9a47e37c"}, + {file = "orjson-3.9.15-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fdfa97090e2d6f73dced247a2f2d8004ac6449df6568f30e7fa1a045767c69a6"}, + {file = "orjson-3.9.15-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7413070a3e927e4207d00bd65f42d1b780fb0d32d7b1d951f6dc6ade318e1b5a"}, + {file = "orjson-3.9.15-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9cf1596680ac1f01839dba32d496136bdd5d8ffb858c280fa82bbfeb173bdd40"}, + {file = "orjson-3.9.15-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:809d653c155e2cc4fd39ad69c08fdff7f4016c355ae4b88905219d3579e31eb7"}, + {file = "orjson-3.9.15-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:920fa5a0c5175ab14b9c78f6f820b75804fb4984423ee4c4f1e6d748f8b22bc1"}, + {file = "orjson-3.9.15-cp38-none-win32.whl", hash = "sha256:2b5c0f532905e60cf22a511120e3719b85d9c25d0e1c2a8abb20c4dede3b05a5"}, + {file = "orjson-3.9.15-cp38-none-win_amd64.whl", hash = "sha256:67384f588f7f8daf040114337d34a5188346e3fae6c38b6a19a2fe8c663a2f9b"}, + {file = "orjson-3.9.15-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:6fc2fe4647927070df3d93f561d7e588a38865ea0040027662e3e541d592811e"}, + {file = "orjson-3.9.15-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34cbcd216e7af5270f2ffa63a963346845eb71e174ea530867b7443892d77180"}, + {file = 
"orjson-3.9.15-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f541587f5c558abd93cb0de491ce99a9ef8d1ae29dd6ab4dbb5a13281ae04cbd"}, + {file = "orjson-3.9.15-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:92255879280ef9c3c0bcb327c5a1b8ed694c290d61a6a532458264f887f052cb"}, + {file = "orjson-3.9.15-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:05a1f57fb601c426635fcae9ddbe90dfc1ed42245eb4c75e4960440cac667262"}, + {file = "orjson-3.9.15-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ede0bde16cc6e9b96633df1631fbcd66491d1063667f260a4f2386a098393790"}, + {file = "orjson-3.9.15-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:e88b97ef13910e5f87bcbc4dd7979a7de9ba8702b54d3204ac587e83639c0c2b"}, + {file = "orjson-3.9.15-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:57d5d8cf9c27f7ef6bc56a5925c7fbc76b61288ab674eb352c26ac780caa5b10"}, + {file = "orjson-3.9.15-cp39-none-win32.whl", hash = "sha256:001f4eb0ecd8e9ebd295722d0cbedf0748680fb9998d3993abaed2f40587257a"}, + {file = "orjson-3.9.15-cp39-none-win_amd64.whl", hash = "sha256:ea0b183a5fe6b2b45f3b854b0d19c4e932d6f5934ae1f723b07cf9560edd4ec7"}, + {file = "orjson-3.9.15.tar.gz", hash = "sha256:95cae920959d772f30ab36d3b25f83bb0f3be671e986c72ce22f8fa700dae061"}, ] [[package]] @@ -2526,40 +2600,40 @@ files = [ [[package]] name = "pandas" -version = "2.2.0" +version = "2.2.1" description = "Powerful data structures for data analysis, time series, and statistics" optional = false python-versions = ">=3.9" files = [ - {file = "pandas-2.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8108ee1712bb4fa2c16981fba7e68b3f6ea330277f5ca34fa8d557e986a11670"}, - {file = "pandas-2.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:736da9ad4033aeab51d067fc3bd69a0ba36f5a60f66a527b3d72e2030e63280a"}, - {file = "pandas-2.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38e0b4fc3ddceb56ec8a287313bc22abe17ab0eb184069f08fc6a9352a769b18"}, - {file = "pandas-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20404d2adefe92aed3b38da41d0847a143a09be982a31b85bc7dd565bdba0f4e"}, - {file = "pandas-2.2.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:7ea3ee3f125032bfcade3a4cf85131ed064b4f8dd23e5ce6fa16473e48ebcaf5"}, - {file = "pandas-2.2.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f9670b3ac00a387620489dfc1bca66db47a787f4e55911f1293063a78b108df1"}, - {file = "pandas-2.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:5a946f210383c7e6d16312d30b238fd508d80d927014f3b33fb5b15c2f895430"}, - {file = "pandas-2.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a1b438fa26b208005c997e78672f1aa8138f67002e833312e6230f3e57fa87d5"}, - {file = "pandas-2.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8ce2fbc8d9bf303ce54a476116165220a1fedf15985b09656b4b4275300e920b"}, - {file = "pandas-2.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2707514a7bec41a4ab81f2ccce8b382961a29fbe9492eab1305bb075b2b1ff4f"}, - {file = "pandas-2.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85793cbdc2d5bc32620dc8ffa715423f0c680dacacf55056ba13454a5be5de88"}, - {file = "pandas-2.2.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:cfd6c2491dc821b10c716ad6776e7ab311f7df5d16038d0b7458bc0b67dc10f3"}, - {file = "pandas-2.2.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:a146b9dcacc3123aa2b399df1a284de5f46287a4ab4fbfc237eac98a92ebcb71"}, - {file = "pandas-2.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:fbc1b53c0e1fdf16388c33c3cca160f798d38aea2978004dd3f4d3dec56454c9"}, - {file = "pandas-2.2.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a41d06f308a024981dcaa6c41f2f2be46a6b186b902c94c2674e8cb5c42985bc"}, - {file = "pandas-2.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:159205c99d7a5ce89ecfc37cb08ed179de7783737cea403b295b5eda8e9c56d1"}, - {file = "pandas-2.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eb1e1f3861ea9132b32f2133788f3b14911b68102d562715d71bd0013bc45440"}, - {file = "pandas-2.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:761cb99b42a69005dec2b08854fb1d4888fdf7b05db23a8c5a099e4b886a2106"}, - {file = "pandas-2.2.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:a20628faaf444da122b2a64b1e5360cde100ee6283ae8effa0d8745153809a2e"}, - {file = "pandas-2.2.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f5be5d03ea2073627e7111f61b9f1f0d9625dc3c4d8dda72cc827b0c58a1d042"}, - {file = "pandas-2.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:a626795722d893ed6aacb64d2401d017ddc8a2341b49e0384ab9bf7112bdec30"}, - {file = "pandas-2.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9f66419d4a41132eb7e9a73dcec9486cf5019f52d90dd35547af11bc58f8637d"}, - {file = "pandas-2.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:57abcaeda83fb80d447f28ab0cc7b32b13978f6f733875ebd1ed14f8fbc0f4ab"}, - {file = "pandas-2.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e60f1f7dba3c2d5ca159e18c46a34e7ca7247a73b5dd1a22b6d59707ed6b899a"}, - {file = "pandas-2.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb61dc8567b798b969bcc1fc964788f5a68214d333cade8319c7ab33e2b5d88a"}, - {file = "pandas-2.2.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:52826b5f4ed658fa2b729264d63f6732b8b29949c7fd234510d57c61dbeadfcd"}, - {file = "pandas-2.2.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:bde2bc699dbd80d7bc7f9cab1e23a95c4375de615860ca089f34e7c64f4a8de7"}, - {file = "pandas-2.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:3de918a754bbf2da2381e8a3dcc45eede8cd7775b047b923f9006d5f876802ae"}, - {file = "pandas-2.2.0.tar.gz", hash = "sha256:30b83f7c3eb217fb4d1b494a57a2fda5444f17834f5df2de6b2ffff68dc3c8e2"}, + {file = "pandas-2.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8df8612be9cd1c7797c93e1c5df861b2ddda0b48b08f2c3eaa0702cf88fb5f88"}, + {file = "pandas-2.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0f573ab277252ed9aaf38240f3b54cfc90fff8e5cab70411ee1d03f5d51f3944"}, + {file = "pandas-2.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f02a3a6c83df4026e55b63c1f06476c9aa3ed6af3d89b4f04ea656ccdaaaa359"}, + {file = "pandas-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c38ce92cb22a4bea4e3929429aa1067a454dcc9c335799af93ba9be21b6beb51"}, + {file = "pandas-2.2.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:c2ce852e1cf2509a69e98358e8458775f89599566ac3775e70419b98615f4b06"}, + {file = "pandas-2.2.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:53680dc9b2519cbf609c62db3ed7c0b499077c7fefda564e330286e619ff0dd9"}, + {file = "pandas-2.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:94e714a1cca63e4f5939cdce5f29ba8d415d85166be3441165edd427dc9f6bc0"}, + {file = "pandas-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:f821213d48f4ab353d20ebc24e4faf94ba40d76680642fb7ce2ea31a3ad94f9b"}, + {file = "pandas-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c70e00c2d894cb230e5c15e4b1e1e6b2b478e09cf27cc593a11ef955b9ecc81a"}, + {file = "pandas-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e97fbb5387c69209f134893abc788a6486dbf2f9e511070ca05eed4b930b1b02"}, + {file = "pandas-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:101d0eb9c5361aa0146f500773395a03839a5e6ecde4d4b6ced88b7e5a1a6403"}, + {file = "pandas-2.2.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:7d2ed41c319c9fb4fd454fe25372028dfa417aacb9790f68171b2e3f06eae8cd"}, + {file = "pandas-2.2.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:af5d3c00557d657c8773ef9ee702c61dd13b9d7426794c9dfeb1dc4a0bf0ebc7"}, + {file = "pandas-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:06cf591dbaefb6da9de8472535b185cba556d0ce2e6ed28e21d919704fef1a9e"}, + {file = "pandas-2.2.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:88ecb5c01bb9ca927ebc4098136038519aa5d66b44671861ffab754cae75102c"}, + {file = "pandas-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:04f6ec3baec203c13e3f8b139fb0f9f86cd8c0b94603ae3ae8ce9a422e9f5bee"}, + {file = "pandas-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a935a90a76c44fe170d01e90a3594beef9e9a6220021acfb26053d01426f7dc2"}, + {file = "pandas-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c391f594aae2fd9f679d419e9a4d5ba4bce5bb13f6a989195656e7dc4b95c8f0"}, + {file = "pandas-2.2.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9d1265545f579edf3f8f0cb6f89f234f5e44ba725a34d86535b1a1d38decbccc"}, + {file = "pandas-2.2.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:11940e9e3056576ac3244baef2fedade891977bcc1cb7e5cc8f8cc7d603edc89"}, + {file = "pandas-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:4acf681325ee1c7f950d058b05a820441075b0dd9a2adf5c4835b9bc056bf4fb"}, + {file = "pandas-2.2.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9bd8a40f47080825af4317d0340c656744f2bfdb6819f818e6ba3cd24c0e1397"}, + {file = "pandas-2.2.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:df0c37ebd19e11d089ceba66eba59a168242fc6b7155cba4ffffa6eccdfb8f16"}, + {file = "pandas-2.2.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:739cc70eaf17d57608639e74d63387b0d8594ce02f69e7a0b046f117974b3019"}, + {file = "pandas-2.2.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9d3558d263073ed95e46f4650becff0c5e1ffe0fc3a015de3c79283dfbdb3df"}, + {file = "pandas-2.2.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4aa1d8707812a658debf03824016bf5ea0d516afdea29b7dc14cf687bc4d4ec6"}, + {file = "pandas-2.2.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:76f27a809cda87e07f192f001d11adc2b930e93a2b0c4a236fde5429527423be"}, + {file = "pandas-2.2.1-cp39-cp39-win_amd64.whl", hash = "sha256:1ba21b1d5c0e43416218db63037dbe1a01fc101dc6e6024bcad08123e48004ab"}, + {file = "pandas-2.2.1.tar.gz", hash = "sha256:0ab90f87093c13f3e8fa45b48ba9f39181046e8f3317d3aadb2fffbb1b978572"}, ] [package.dependencies] @@ -2590,6 +2664,7 @@ parquet = ["pyarrow (>=10.0.1)"] performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"] plot = ["matplotlib (>=3.6.3)"] postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"] +pyarrow = ["pyarrow (>=10.0.1)"] spss = ["pyreadstat (>=1.2.0)"] 
sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"] test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] @@ -2742,13 +2817,13 @@ tests = ["pytest (>=5.4.1)", "pytest-cov (>=2.8.1)", "pytest-mypy (>=0.8.0)", "p [[package]] name = "posthog" -version = "3.4.2" +version = "3.5.0" description = "Integrate PostHog into any python application." optional = false python-versions = "*" files = [ - {file = "posthog-3.4.2-py2.py3-none-any.whl", hash = "sha256:c7e79b2e585d16e93749874bcbcdad78d857037398ce0d8d6c474a04d0bd3bbe"}, - {file = "posthog-3.4.2.tar.gz", hash = "sha256:f0eafa663fbc4a942b49b6168a62a890635407044bbc7593051dcb9cc1208873"}, + {file = "posthog-3.5.0-py2.py3-none-any.whl", hash = "sha256:3c672be7ba6f95d555ea207d4486c171d06657eb34b3ce25eb043bfe7b6b5b76"}, + {file = "posthog-3.5.0.tar.gz", hash = "sha256:8f7e3b2c6e8714d0c0c542a2109b83a7549f63b7113a133ab2763a89245ef2ef"}, ] [package.dependencies] @@ -2832,47 +2907,47 @@ functions = ["apache-bookkeeper-client (>=4.16.1)", "grpcio (>=1.60.0)", "promet [[package]] name = "pyarrow" -version = "15.0.0" +version = "15.0.1" description = "Python library for Apache Arrow" optional = false python-versions = ">=3.8" files = [ - {file = "pyarrow-15.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:0a524532fd6dd482edaa563b686d754c70417c2f72742a8c990b322d4c03a15d"}, - {file = "pyarrow-15.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:60a6bdb314affa9c2e0d5dddf3d9cbb9ef4a8dddaa68669975287d47ece67642"}, - {file = "pyarrow-15.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:66958fd1771a4d4b754cd385835e66a3ef6b12611e001d4e5edfcef5f30391e2"}, - {file = "pyarrow-15.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f500956a49aadd907eaa21d4fff75f73954605eaa41f61cb94fb008cf2e00c6"}, - {file = "pyarrow-15.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:6f87d9c4f09e049c2cade559643424da84c43a35068f2a1c4653dc5b1408a929"}, - {file = "pyarrow-15.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:85239b9f93278e130d86c0e6bb455dcb66fc3fd891398b9d45ace8799a871a1e"}, - {file = "pyarrow-15.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5b8d43e31ca16aa6e12402fcb1e14352d0d809de70edd185c7650fe80e0769e3"}, - {file = "pyarrow-15.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:fa7cd198280dbd0c988df525e50e35b5d16873e2cdae2aaaa6363cdb64e3eec5"}, - {file = "pyarrow-15.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8780b1a29d3c8b21ba6b191305a2a607de2e30dab399776ff0aa09131e266340"}, - {file = "pyarrow-15.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fe0ec198ccc680f6c92723fadcb97b74f07c45ff3fdec9dd765deb04955ccf19"}, - {file = "pyarrow-15.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:036a7209c235588c2f07477fe75c07e6caced9b7b61bb897c8d4e52c4b5f9555"}, - {file = "pyarrow-15.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:2bd8a0e5296797faf9a3294e9fa2dc67aa7f10ae2207920dbebb785c77e9dbe5"}, - {file = "pyarrow-15.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e8ebed6053dbe76883a822d4e8da36860f479d55a762bd9e70d8494aed87113e"}, - {file = "pyarrow-15.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:17d53a9d1b2b5bd7d5e4cd84d018e2a45bc9baaa68f7e6e3ebed45649900ba99"}, - {file = "pyarrow-15.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9950a9c9df24090d3d558b43b97753b8f5867fb8e521f29876aa021c52fda351"}, - 
{file = "pyarrow-15.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:003d680b5e422d0204e7287bb3fa775b332b3fce2996aa69e9adea23f5c8f970"}, - {file = "pyarrow-15.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f75fce89dad10c95f4bf590b765e3ae98bcc5ba9f6ce75adb828a334e26a3d40"}, - {file = "pyarrow-15.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ca9cb0039923bec49b4fe23803807e4ef39576a2bec59c32b11296464623dc2"}, - {file = "pyarrow-15.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ed5a78ed29d171d0acc26a305a4b7f83c122d54ff5270810ac23c75813585e4"}, - {file = "pyarrow-15.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6eda9e117f0402dfcd3cd6ec9bfee89ac5071c48fc83a84f3075b60efa96747f"}, - {file = "pyarrow-15.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:9a3a6180c0e8f2727e6f1b1c87c72d3254cac909e609f35f22532e4115461177"}, - {file = "pyarrow-15.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:19a8918045993349b207de72d4576af0191beef03ea655d8bdb13762f0cd6eac"}, - {file = "pyarrow-15.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d0ec076b32bacb6666e8813a22e6e5a7ef1314c8069d4ff345efa6246bc38593"}, - {file = "pyarrow-15.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5db1769e5d0a77eb92344c7382d6543bea1164cca3704f84aa44e26c67e320fb"}, - {file = "pyarrow-15.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e2617e3bf9df2a00020dd1c1c6dce5cc343d979efe10bc401c0632b0eef6ef5b"}, - {file = "pyarrow-15.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:d31c1d45060180131caf10f0f698e3a782db333a422038bf7fe01dace18b3a31"}, - {file = "pyarrow-15.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:c8c287d1d479de8269398b34282e206844abb3208224dbdd7166d580804674b7"}, - {file = "pyarrow-15.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:07eb7f07dc9ecbb8dace0f58f009d3a29ee58682fcdc91337dfeb51ea618a75b"}, - {file = "pyarrow-15.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:47af7036f64fce990bb8a5948c04722e4e3ea3e13b1007ef52dfe0aa8f23cf7f"}, - {file = "pyarrow-15.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:93768ccfff85cf044c418bfeeafce9a8bb0cee091bd8fd19011aff91e58de540"}, - {file = "pyarrow-15.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f6ee87fd6892700960d90abb7b17a72a5abb3b64ee0fe8db6c782bcc2d0dc0b4"}, - {file = "pyarrow-15.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:001fca027738c5f6be0b7a3159cc7ba16a5c52486db18160909a0831b063c4e4"}, - {file = "pyarrow-15.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:d1c48648f64aec09accf44140dccb92f4f94394b8d79976c426a5b79b11d4fa7"}, - {file = "pyarrow-15.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:972a0141be402bb18e3201448c8ae62958c9c7923dfaa3b3d4530c835ac81aed"}, - {file = "pyarrow-15.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:f01fc5cf49081426429127aa2d427d9d98e1cb94a32cb961d583a70b7c4504e6"}, - {file = "pyarrow-15.0.0.tar.gz", hash = "sha256:876858f549d540898f927eba4ef77cd549ad8d24baa3207cf1b72e5788b50e83"}, + {file = "pyarrow-15.0.1-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:c2ddb3be5ea938c329a84171694fc230b241ce1b6b0ff1a0280509af51c375fa"}, + {file = "pyarrow-15.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7543ea88a0ff72f8e6baaf9bfdbec2c62aeabdbede9e4a571c71cc3bc43b6302"}, + {file = "pyarrow-15.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:1519e218a6941fc074e4501088d891afcb2adf77c236e03c34babcf3d6a0d1c7"}, + {file = "pyarrow-15.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28cafa86e1944761970d3b3fc0411b14ff9b5c2b73cd22aaf470d7a3976335f5"}, + {file = "pyarrow-15.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:be5c3d463e33d03eab496e1af7916b1d44001c08f0f458ad27dc16093a020638"}, + {file = "pyarrow-15.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:47b1eda15d3aa3f49a07b1808648e1397e5dc6a80a30bf87faa8e2d02dad7ac3"}, + {file = "pyarrow-15.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:e524a31be7db22deebbbcf242b189063ab9a7652c62471d296b31bc6e3cae77b"}, + {file = "pyarrow-15.0.1-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:a476fefe8bdd56122fb0d4881b785413e025858803cc1302d0d788d3522b374d"}, + {file = "pyarrow-15.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:309e6191be385f2e220586bfdb643f9bb21d7e1bc6dd0a6963dc538e347b2431"}, + {file = "pyarrow-15.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83bc586903dbeb4365cbc72b602f99f70b96c5882e5dfac5278813c7d624ca3c"}, + {file = "pyarrow-15.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:07e652daac6d8b05280cd2af31c0fb61a4490ec6a53dc01588014d9fa3fdbee9"}, + {file = "pyarrow-15.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:abad2e08652df153a72177ce20c897d083b0c4ebeec051239e2654ddf4d3c996"}, + {file = "pyarrow-15.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:cde663352bc83ad75ba7b3206e049ca1a69809223942362a8649e37bd22f9e3b"}, + {file = "pyarrow-15.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:1b6e237dd7a08482a8b8f3f6512d258d2460f182931832a8c6ef3953203d31e1"}, + {file = "pyarrow-15.0.1-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:7bd167536ee23192760b8c731d39b7cfd37914c27fd4582335ffd08450ff799d"}, + {file = "pyarrow-15.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7c08bb31eb2984ba5c3747d375bb522e7e536b8b25b149c9cb5e1c49b0ccb736"}, + {file = "pyarrow-15.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c0f9c1d630ed2524bd1ddf28ec92780a7b599fd54704cd653519f7ff5aec177a"}, + {file = "pyarrow-15.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5186048493395220550bca7b524420471aac2d77af831f584ce132680f55c3df"}, + {file = "pyarrow-15.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:31dc30c7ec8958da3a3d9f31d6c3630429b2091ede0ecd0d989fd6bec129f0e4"}, + {file = "pyarrow-15.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:3f111a014fb8ac2297b43a74bf4495cc479a332908f7ee49cb7cbd50714cb0c1"}, + {file = "pyarrow-15.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:a6d1f7c15d7f68f08490d0cb34611497c74285b8a6bbeab4ef3fc20117310983"}, + {file = "pyarrow-15.0.1-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:9ad931b996f51c2f978ed517b55cb3c6078272fb4ec579e3da5a8c14873b698d"}, + {file = "pyarrow-15.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:738f6b53ab1c2f66b2bde8a1d77e186aeaab702d849e0dfa1158c9e2c030add3"}, + {file = "pyarrow-15.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c1c3fc16bc74e33bf8f1e5a212938ed8d88e902f372c4dac6b5bad328567d2f"}, + {file = "pyarrow-15.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1fa92512128f6c1b8dde0468c1454dd70f3bff623970e370d52efd4d24fd0be"}, + {file = "pyarrow-15.0.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = 
"sha256:b4157f307c202cbbdac147d9b07447a281fa8e63494f7fc85081da351ec6ace9"}, + {file = "pyarrow-15.0.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:b75e7da26f383787f80ad76143b44844ffa28648fcc7099a83df1538c078d2f2"}, + {file = "pyarrow-15.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:3a99eac76ae14096c209850935057b9e8ce97a78397c5cde8724674774f34e5d"}, + {file = "pyarrow-15.0.1-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:dd532d3177e031e9b2d2df19fd003d0cc0520d1747659fcabbd4d9bb87de508c"}, + {file = "pyarrow-15.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ce8c89848fd37e5313fc2ce601483038ee5566db96ba0808d5883b2e2e55dc53"}, + {file = "pyarrow-15.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:862eac5e5f3b6477f7a92b2f27e560e1f4e5e9edfca9ea9da8a7478bb4abd5ce"}, + {file = "pyarrow-15.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f0ea3a29cd5cb99bf14c1c4533eceaa00ea8fb580950fb5a89a5c771a994a4e"}, + {file = "pyarrow-15.0.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:bb902f780cfd624b2e8fd8501fadab17618fdb548532620ef3d91312aaf0888a"}, + {file = "pyarrow-15.0.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:4f87757f02735a6bb4ad2e1b98279ac45d53b748d5baf52401516413007c6999"}, + {file = "pyarrow-15.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:efd3816c7fbfcbd406ac0f69873cebb052effd7cdc153ae5836d1b00845845d7"}, + {file = "pyarrow-15.0.1.tar.gz", hash = "sha256:21d812548d39d490e0c6928a7c663f37b96bf764034123d4b4ab4530ecc757a9"}, ] [package.dependencies] @@ -2916,18 +2991,18 @@ pyasn1 = ">=0.4.6,<0.6.0" [[package]] name = "pydantic" -version = "2.6.1" +version = "2.6.3" description = "Data validation using Python type hints" optional = false python-versions = ">=3.8" files = [ - {file = "pydantic-2.6.1-py3-none-any.whl", hash = "sha256:0b6a909df3192245cb736509a92ff69e4fef76116feffec68e93a567347bae6f"}, - {file = "pydantic-2.6.1.tar.gz", hash = "sha256:4fd5c182a2488dc63e6d32737ff19937888001e2a6d86e94b3f233104a5d1fa9"}, + {file = "pydantic-2.6.3-py3-none-any.whl", hash = "sha256:72c6034df47f46ccdf81869fddb81aade68056003900a8724a4f160700016a2a"}, + {file = "pydantic-2.6.3.tar.gz", hash = "sha256:e07805c4c7f5c6826e33a1d4c9d47950d7eaf34868e2690f8594d2e30241f11f"}, ] [package.dependencies] annotated-types = ">=0.4.0" -pydantic-core = "2.16.2" +pydantic-core = "2.16.3" typing-extensions = ">=4.6.1" [package.extras] @@ -2935,90 +3010,90 @@ email = ["email-validator (>=2.0.0)"] [[package]] name = "pydantic-core" -version = "2.16.2" +version = "2.16.3" description = "" optional = false python-versions = ">=3.8" files = [ - {file = "pydantic_core-2.16.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3fab4e75b8c525a4776e7630b9ee48aea50107fea6ca9f593c98da3f4d11bf7c"}, - {file = "pydantic_core-2.16.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8bde5b48c65b8e807409e6f20baee5d2cd880e0fad00b1a811ebc43e39a00ab2"}, - {file = "pydantic_core-2.16.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2924b89b16420712e9bb8192396026a8fbd6d8726224f918353ac19c4c043d2a"}, - {file = "pydantic_core-2.16.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:16aa02e7a0f539098e215fc193c8926c897175d64c7926d00a36188917717a05"}, - {file = "pydantic_core-2.16.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:936a787f83db1f2115ee829dd615c4f684ee48ac4de5779ab4300994d8af325b"}, - {file = 
"pydantic_core-2.16.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:459d6be6134ce3b38e0ef76f8a672924460c455d45f1ad8fdade36796df1ddc8"}, - {file = "pydantic_core-2.16.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f9ee4febb249c591d07b2d4dd36ebcad0ccd128962aaa1801508320896575ef"}, - {file = "pydantic_core-2.16.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:40a0bd0bed96dae5712dab2aba7d334a6c67cbcac2ddfca7dbcc4a8176445990"}, - {file = "pydantic_core-2.16.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:870dbfa94de9b8866b37b867a2cb37a60c401d9deb4a9ea392abf11a1f98037b"}, - {file = "pydantic_core-2.16.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:308974fdf98046db28440eb3377abba274808bf66262e042c412eb2adf852731"}, - {file = "pydantic_core-2.16.2-cp310-none-win32.whl", hash = "sha256:a477932664d9611d7a0816cc3c0eb1f8856f8a42435488280dfbf4395e141485"}, - {file = "pydantic_core-2.16.2-cp310-none-win_amd64.whl", hash = "sha256:8f9142a6ed83d90c94a3efd7af8873bf7cefed2d3d44387bf848888482e2d25f"}, - {file = "pydantic_core-2.16.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:406fac1d09edc613020ce9cf3f2ccf1a1b2f57ab00552b4c18e3d5276c67eb11"}, - {file = "pydantic_core-2.16.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ce232a6170dd6532096cadbf6185271e4e8c70fc9217ebe105923ac105da9978"}, - {file = "pydantic_core-2.16.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a90fec23b4b05a09ad988e7a4f4e081711a90eb2a55b9c984d8b74597599180f"}, - {file = "pydantic_core-2.16.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8aafeedb6597a163a9c9727d8a8bd363a93277701b7bfd2749fbefee2396469e"}, - {file = "pydantic_core-2.16.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9957433c3a1b67bdd4c63717eaf174ebb749510d5ea612cd4e83f2d9142f3fc8"}, - {file = "pydantic_core-2.16.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b0d7a9165167269758145756db43a133608a531b1e5bb6a626b9ee24bc38a8f7"}, - {file = "pydantic_core-2.16.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dffaf740fe2e147fedcb6b561353a16243e654f7fe8e701b1b9db148242e1272"}, - {file = "pydantic_core-2.16.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f8ed79883b4328b7f0bd142733d99c8e6b22703e908ec63d930b06be3a0e7113"}, - {file = "pydantic_core-2.16.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:cf903310a34e14651c9de056fcc12ce090560864d5a2bb0174b971685684e1d8"}, - {file = "pydantic_core-2.16.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:46b0d5520dbcafea9a8645a8164658777686c5c524d381d983317d29687cce97"}, - {file = "pydantic_core-2.16.2-cp311-none-win32.whl", hash = "sha256:70651ff6e663428cea902dac297066d5c6e5423fda345a4ca62430575364d62b"}, - {file = "pydantic_core-2.16.2-cp311-none-win_amd64.whl", hash = "sha256:98dc6f4f2095fc7ad277782a7c2c88296badcad92316b5a6e530930b1d475ebc"}, - {file = "pydantic_core-2.16.2-cp311-none-win_arm64.whl", hash = "sha256:ef6113cd31411eaf9b39fc5a8848e71c72656fd418882488598758b2c8c6dfa0"}, - {file = "pydantic_core-2.16.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:88646cae28eb1dd5cd1e09605680c2b043b64d7481cdad7f5003ebef401a3039"}, - {file = "pydantic_core-2.16.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7b883af50eaa6bb3299780651e5be921e88050ccf00e3e583b1e92020333304b"}, - {file = 
"pydantic_core-2.16.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bf26c2e2ea59d32807081ad51968133af3025c4ba5753e6a794683d2c91bf6e"}, - {file = "pydantic_core-2.16.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:99af961d72ac731aae2a1b55ccbdae0733d816f8bfb97b41909e143de735f522"}, - {file = "pydantic_core-2.16.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:02906e7306cb8c5901a1feb61f9ab5e5c690dbbeaa04d84c1b9ae2a01ebe9379"}, - {file = "pydantic_core-2.16.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5362d099c244a2d2f9659fb3c9db7c735f0004765bbe06b99be69fbd87c3f15"}, - {file = "pydantic_core-2.16.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ac426704840877a285d03a445e162eb258924f014e2f074e209d9b4ff7bf380"}, - {file = "pydantic_core-2.16.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b94cbda27267423411c928208e89adddf2ea5dd5f74b9528513f0358bba019cb"}, - {file = "pydantic_core-2.16.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:6db58c22ac6c81aeac33912fb1af0e930bc9774166cdd56eade913d5f2fff35e"}, - {file = "pydantic_core-2.16.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:396fdf88b1b503c9c59c84a08b6833ec0c3b5ad1a83230252a9e17b7dfb4cffc"}, - {file = "pydantic_core-2.16.2-cp312-none-win32.whl", hash = "sha256:7c31669e0c8cc68400ef0c730c3a1e11317ba76b892deeefaf52dcb41d56ed5d"}, - {file = "pydantic_core-2.16.2-cp312-none-win_amd64.whl", hash = "sha256:a3b7352b48fbc8b446b75f3069124e87f599d25afb8baa96a550256c031bb890"}, - {file = "pydantic_core-2.16.2-cp312-none-win_arm64.whl", hash = "sha256:a9e523474998fb33f7c1a4d55f5504c908d57add624599e095c20fa575b8d943"}, - {file = "pydantic_core-2.16.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:ae34418b6b389d601b31153b84dce480351a352e0bb763684a1b993d6be30f17"}, - {file = "pydantic_core-2.16.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:732bd062c9e5d9582a30e8751461c1917dd1ccbdd6cafb032f02c86b20d2e7ec"}, - {file = "pydantic_core-2.16.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4b52776a2e3230f4854907a1e0946eec04d41b1fc64069ee774876bbe0eab55"}, - {file = "pydantic_core-2.16.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ef551c053692b1e39e3f7950ce2296536728871110e7d75c4e7753fb30ca87f4"}, - {file = "pydantic_core-2.16.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ebb892ed8599b23fa8f1799e13a12c87a97a6c9d0f497525ce9858564c4575a4"}, - {file = "pydantic_core-2.16.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aa6c8c582036275997a733427b88031a32ffa5dfc3124dc25a730658c47a572f"}, - {file = "pydantic_core-2.16.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4ba0884a91f1aecce75202473ab138724aa4fb26d7707f2e1fa6c3e68c84fbf"}, - {file = "pydantic_core-2.16.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7924e54f7ce5d253d6160090ddc6df25ed2feea25bfb3339b424a9dd591688bc"}, - {file = "pydantic_core-2.16.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:69a7b96b59322a81c2203be537957313b07dd333105b73db0b69212c7d867b4b"}, - {file = "pydantic_core-2.16.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7e6231aa5bdacda78e96ad7b07d0c312f34ba35d717115f4b4bff6cb87224f0f"}, - {file = "pydantic_core-2.16.2-cp38-none-win32.whl", hash = "sha256:41dac3b9fce187a25c6253ec79a3f9e2a7e761eb08690e90415069ea4a68ff7a"}, - 
{file = "pydantic_core-2.16.2-cp38-none-win_amd64.whl", hash = "sha256:f685dbc1fdadb1dcd5b5e51e0a378d4685a891b2ddaf8e2bba89bd3a7144e44a"}, - {file = "pydantic_core-2.16.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:55749f745ebf154c0d63d46c8c58594d8894b161928aa41adbb0709c1fe78b77"}, - {file = "pydantic_core-2.16.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b30b0dd58a4509c3bd7eefddf6338565c4905406aee0c6e4a5293841411a1286"}, - {file = "pydantic_core-2.16.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18de31781cdc7e7b28678df7c2d7882f9692ad060bc6ee3c94eb15a5d733f8f7"}, - {file = "pydantic_core-2.16.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5864b0242f74b9dd0b78fd39db1768bc3f00d1ffc14e596fd3e3f2ce43436a33"}, - {file = "pydantic_core-2.16.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8f9186ca45aee030dc8234118b9c0784ad91a0bb27fc4e7d9d6608a5e3d386c"}, - {file = "pydantic_core-2.16.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cc6f6c9be0ab6da37bc77c2dda5f14b1d532d5dbef00311ee6e13357a418e646"}, - {file = "pydantic_core-2.16.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa057095f621dad24a1e906747179a69780ef45cc8f69e97463692adbcdae878"}, - {file = "pydantic_core-2.16.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6ad84731a26bcfb299f9eab56c7932d46f9cad51c52768cace09e92a19e4cf55"}, - {file = "pydantic_core-2.16.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:3b052c753c4babf2d1edc034c97851f867c87d6f3ea63a12e2700f159f5c41c3"}, - {file = "pydantic_core-2.16.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e0f686549e32ccdb02ae6f25eee40cc33900910085de6aa3790effd391ae10c2"}, - {file = "pydantic_core-2.16.2-cp39-none-win32.whl", hash = "sha256:7afb844041e707ac9ad9acad2188a90bffce2c770e6dc2318be0c9916aef1469"}, - {file = "pydantic_core-2.16.2-cp39-none-win_amd64.whl", hash = "sha256:9da90d393a8227d717c19f5397688a38635afec89f2e2d7af0df037f3249c39a"}, - {file = "pydantic_core-2.16.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5f60f920691a620b03082692c378661947d09415743e437a7478c309eb0e4f82"}, - {file = "pydantic_core-2.16.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:47924039e785a04d4a4fa49455e51b4eb3422d6eaacfde9fc9abf8fdef164e8a"}, - {file = "pydantic_core-2.16.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e6294e76b0380bb7a61eb8a39273c40b20beb35e8c87ee101062834ced19c545"}, - {file = "pydantic_core-2.16.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe56851c3f1d6f5384b3051c536cc81b3a93a73faf931f404fef95217cf1e10d"}, - {file = "pydantic_core-2.16.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9d776d30cde7e541b8180103c3f294ef7c1862fd45d81738d156d00551005784"}, - {file = "pydantic_core-2.16.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:72f7919af5de5ecfaf1eba47bf9a5d8aa089a3340277276e5636d16ee97614d7"}, - {file = "pydantic_core-2.16.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:4bfcbde6e06c56b30668a0c872d75a7ef3025dc3c1823a13cf29a0e9b33f67e8"}, - {file = "pydantic_core-2.16.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ff7c97eb7a29aba230389a2661edf2e9e06ce616c7e35aa764879b6894a44b25"}, - {file = "pydantic_core-2.16.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:9b5f13857da99325dcabe1cc4e9e6a3d7b2e2c726248ba5dd4be3e8e4a0b6d0e"}, - 
{file = "pydantic_core-2.16.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:a7e41e3ada4cca5f22b478c08e973c930e5e6c7ba3588fb8e35f2398cdcc1545"}, - {file = "pydantic_core-2.16.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:60eb8ceaa40a41540b9acae6ae7c1f0a67d233c40dc4359c256ad2ad85bdf5e5"}, - {file = "pydantic_core-2.16.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7beec26729d496a12fd23cf8da9944ee338c8b8a17035a560b585c36fe81af20"}, - {file = "pydantic_core-2.16.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:22c5f022799f3cd6741e24f0443ead92ef42be93ffda0d29b2597208c94c3753"}, - {file = "pydantic_core-2.16.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:eca58e319f4fd6df004762419612122b2c7e7d95ffafc37e890252f869f3fb2a"}, - {file = "pydantic_core-2.16.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:ed957db4c33bc99895f3a1672eca7e80e8cda8bd1e29a80536b4ec2153fa9804"}, - {file = "pydantic_core-2.16.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:459c0d338cc55d099798618f714b21b7ece17eb1a87879f2da20a3ff4c7628e2"}, - {file = "pydantic_core-2.16.2.tar.gz", hash = "sha256:0ba503850d8b8dcc18391f10de896ae51d37fe5fe43dbfb6a35c5c5cad271a06"}, + {file = "pydantic_core-2.16.3-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:75b81e678d1c1ede0785c7f46690621e4c6e63ccd9192af1f0bd9d504bbb6bf4"}, + {file = "pydantic_core-2.16.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9c865a7ee6f93783bd5d781af5a4c43dadc37053a5b42f7d18dc019f8c9d2bd1"}, + {file = "pydantic_core-2.16.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:162e498303d2b1c036b957a1278fa0899d02b2842f1ff901b6395104c5554a45"}, + {file = "pydantic_core-2.16.3-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2f583bd01bbfbff4eaee0868e6fc607efdfcc2b03c1c766b06a707abbc856187"}, + {file = "pydantic_core-2.16.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b926dd38db1519ed3043a4de50214e0d600d404099c3392f098a7f9d75029ff8"}, + {file = "pydantic_core-2.16.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:716b542728d4c742353448765aa7cdaa519a7b82f9564130e2b3f6766018c9ec"}, + {file = "pydantic_core-2.16.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc4ad7f7ee1a13d9cb49d8198cd7d7e3aa93e425f371a68235f784e99741561f"}, + {file = "pydantic_core-2.16.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bd87f48924f360e5d1c5f770d6155ce0e7d83f7b4e10c2f9ec001c73cf475c99"}, + {file = "pydantic_core-2.16.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0df446663464884297c793874573549229f9eca73b59360878f382a0fc085979"}, + {file = "pydantic_core-2.16.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4df8a199d9f6afc5ae9a65f8f95ee52cae389a8c6b20163762bde0426275b7db"}, + {file = "pydantic_core-2.16.3-cp310-none-win32.whl", hash = "sha256:456855f57b413f077dff513a5a28ed838dbbb15082ba00f80750377eed23d132"}, + {file = "pydantic_core-2.16.3-cp310-none-win_amd64.whl", hash = "sha256:732da3243e1b8d3eab8c6ae23ae6a58548849d2e4a4e03a1924c8ddf71a387cb"}, + {file = "pydantic_core-2.16.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:519ae0312616026bf4cedc0fe459e982734f3ca82ee8c7246c19b650b60a5ee4"}, + {file = "pydantic_core-2.16.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b3992a322a5617ded0a9f23fd06dbc1e4bd7cf39bc4ccf344b10f80af58beacd"}, + {file = 
"pydantic_core-2.16.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8d62da299c6ecb04df729e4b5c52dc0d53f4f8430b4492b93aa8de1f541c4aac"}, + {file = "pydantic_core-2.16.3-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2acca2be4bb2f2147ada8cac612f8a98fc09f41c89f87add7256ad27332c2fda"}, + {file = "pydantic_core-2.16.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1b662180108c55dfbf1280d865b2d116633d436cfc0bba82323554873967b340"}, + {file = "pydantic_core-2.16.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e7c6ed0dc9d8e65f24f5824291550139fe6f37fac03788d4580da0d33bc00c97"}, + {file = "pydantic_core-2.16.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6b1bb0827f56654b4437955555dc3aeeebeddc47c2d7ed575477f082622c49e"}, + {file = "pydantic_core-2.16.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e56f8186d6210ac7ece503193ec84104da7ceb98f68ce18c07282fcc2452e76f"}, + {file = "pydantic_core-2.16.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:936e5db01dd49476fa8f4383c259b8b1303d5dd5fb34c97de194560698cc2c5e"}, + {file = "pydantic_core-2.16.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:33809aebac276089b78db106ee692bdc9044710e26f24a9a2eaa35a0f9fa70ba"}, + {file = "pydantic_core-2.16.3-cp311-none-win32.whl", hash = "sha256:ded1c35f15c9dea16ead9bffcde9bb5c7c031bff076355dc58dcb1cb436c4721"}, + {file = "pydantic_core-2.16.3-cp311-none-win_amd64.whl", hash = "sha256:d89ca19cdd0dd5f31606a9329e309d4fcbb3df860960acec32630297d61820df"}, + {file = "pydantic_core-2.16.3-cp311-none-win_arm64.whl", hash = "sha256:6162f8d2dc27ba21027f261e4fa26f8bcb3cf9784b7f9499466a311ac284b5b9"}, + {file = "pydantic_core-2.16.3-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:0f56ae86b60ea987ae8bcd6654a887238fd53d1384f9b222ac457070b7ac4cff"}, + {file = "pydantic_core-2.16.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c9bd22a2a639e26171068f8ebb5400ce2c1bc7d17959f60a3b753ae13c632975"}, + {file = "pydantic_core-2.16.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4204e773b4b408062960e65468d5346bdfe139247ee5f1ca2a378983e11388a2"}, + {file = "pydantic_core-2.16.3-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f651dd19363c632f4abe3480a7c87a9773be27cfe1341aef06e8759599454120"}, + {file = "pydantic_core-2.16.3-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aaf09e615a0bf98d406657e0008e4a8701b11481840be7d31755dc9f97c44053"}, + {file = "pydantic_core-2.16.3-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8e47755d8152c1ab5b55928ab422a76e2e7b22b5ed8e90a7d584268dd49e9c6b"}, + {file = "pydantic_core-2.16.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:500960cb3a0543a724a81ba859da816e8cf01b0e6aaeedf2c3775d12ee49cade"}, + {file = "pydantic_core-2.16.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cf6204fe865da605285c34cf1172879d0314ff267b1c35ff59de7154f35fdc2e"}, + {file = "pydantic_core-2.16.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d33dd21f572545649f90c38c227cc8631268ba25c460b5569abebdd0ec5974ca"}, + {file = "pydantic_core-2.16.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:49d5d58abd4b83fb8ce763be7794d09b2f50f10aa65c0f0c1696c677edeb7cbf"}, + {file = "pydantic_core-2.16.3-cp312-none-win32.whl", hash = 
"sha256:f53aace168a2a10582e570b7736cc5bef12cae9cf21775e3eafac597e8551fbe"}, + {file = "pydantic_core-2.16.3-cp312-none-win_amd64.whl", hash = "sha256:0d32576b1de5a30d9a97f300cc6a3f4694c428d956adbc7e6e2f9cad279e45ed"}, + {file = "pydantic_core-2.16.3-cp312-none-win_arm64.whl", hash = "sha256:ec08be75bb268473677edb83ba71e7e74b43c008e4a7b1907c6d57e940bf34b6"}, + {file = "pydantic_core-2.16.3-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:b1f6f5938d63c6139860f044e2538baeee6f0b251a1816e7adb6cbce106a1f01"}, + {file = "pydantic_core-2.16.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2a1ef6a36fdbf71538142ed604ad19b82f67b05749512e47f247a6ddd06afdc7"}, + {file = "pydantic_core-2.16.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:704d35ecc7e9c31d48926150afada60401c55efa3b46cd1ded5a01bdffaf1d48"}, + {file = "pydantic_core-2.16.3-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d937653a696465677ed583124b94a4b2d79f5e30b2c46115a68e482c6a591c8a"}, + {file = "pydantic_core-2.16.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c9803edf8e29bd825f43481f19c37f50d2b01899448273b3a7758441b512acf8"}, + {file = "pydantic_core-2.16.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:72282ad4892a9fb2da25defeac8c2e84352c108705c972db82ab121d15f14e6d"}, + {file = "pydantic_core-2.16.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f752826b5b8361193df55afcdf8ca6a57d0232653494ba473630a83ba50d8c9"}, + {file = "pydantic_core-2.16.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4384a8f68ddb31a0b0c3deae88765f5868a1b9148939c3f4121233314ad5532c"}, + {file = "pydantic_core-2.16.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:a4b2bf78342c40b3dc830880106f54328928ff03e357935ad26c7128bbd66ce8"}, + {file = "pydantic_core-2.16.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:13dcc4802961b5f843a9385fc821a0b0135e8c07fc3d9949fd49627c1a5e6ae5"}, + {file = "pydantic_core-2.16.3-cp38-none-win32.whl", hash = "sha256:e3e70c94a0c3841e6aa831edab1619ad5c511199be94d0c11ba75fe06efe107a"}, + {file = "pydantic_core-2.16.3-cp38-none-win_amd64.whl", hash = "sha256:ecdf6bf5f578615f2e985a5e1f6572e23aa632c4bd1dc67f8f406d445ac115ed"}, + {file = "pydantic_core-2.16.3-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:bda1ee3e08252b8d41fa5537413ffdddd58fa73107171a126d3b9ff001b9b820"}, + {file = "pydantic_core-2.16.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:21b888c973e4f26b7a96491c0965a8a312e13be108022ee510248fe379a5fa23"}, + {file = "pydantic_core-2.16.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be0ec334369316fa73448cc8c982c01e5d2a81c95969d58b8f6e272884df0074"}, + {file = "pydantic_core-2.16.3-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b5b6079cc452a7c53dd378c6f881ac528246b3ac9aae0f8eef98498a75657805"}, + {file = "pydantic_core-2.16.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7ee8d5f878dccb6d499ba4d30d757111847b6849ae07acdd1205fffa1fc1253c"}, + {file = "pydantic_core-2.16.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7233d65d9d651242a68801159763d09e9ec96e8a158dbf118dc090cd77a104c9"}, + {file = "pydantic_core-2.16.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6119dc90483a5cb50a1306adb8d52c66e447da88ea44f323e0ae1a5fcb14256"}, + {file = "pydantic_core-2.16.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:578114bc803a4c1ff9946d977c221e4376620a46cf78da267d946397dc9514a8"}, + {file = "pydantic_core-2.16.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d8f99b147ff3fcf6b3cc60cb0c39ea443884d5559a30b1481e92495f2310ff2b"}, + {file = "pydantic_core-2.16.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4ac6b4ce1e7283d715c4b729d8f9dab9627586dafce81d9eaa009dd7f25dd972"}, + {file = "pydantic_core-2.16.3-cp39-none-win32.whl", hash = "sha256:e7774b570e61cb998490c5235740d475413a1f6de823169b4cf94e2fe9e9f6b2"}, + {file = "pydantic_core-2.16.3-cp39-none-win_amd64.whl", hash = "sha256:9091632a25b8b87b9a605ec0e61f241c456e9248bfdcf7abdf344fdb169c81cf"}, + {file = "pydantic_core-2.16.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:36fa178aacbc277bc6b62a2c3da95226520da4f4e9e206fdf076484363895d2c"}, + {file = "pydantic_core-2.16.3-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:dcca5d2bf65c6fb591fff92da03f94cd4f315972f97c21975398bd4bd046854a"}, + {file = "pydantic_core-2.16.3-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2a72fb9963cba4cd5793854fd12f4cfee731e86df140f59ff52a49b3552db241"}, + {file = "pydantic_core-2.16.3-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b60cc1a081f80a2105a59385b92d82278b15d80ebb3adb200542ae165cd7d183"}, + {file = "pydantic_core-2.16.3-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cbcc558401de90a746d02ef330c528f2e668c83350f045833543cd57ecead1ad"}, + {file = "pydantic_core-2.16.3-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:fee427241c2d9fb7192b658190f9f5fd6dfe41e02f3c1489d2ec1e6a5ab1e04a"}, + {file = "pydantic_core-2.16.3-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f4cb85f693044e0f71f394ff76c98ddc1bc0953e48c061725e540396d5c8a2e1"}, + {file = "pydantic_core-2.16.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:b29eeb887aa931c2fcef5aa515d9d176d25006794610c264ddc114c053bf96fe"}, + {file = "pydantic_core-2.16.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a425479ee40ff021f8216c9d07a6a3b54b31c8267c6e17aa88b70d7ebd0e5e5b"}, + {file = "pydantic_core-2.16.3-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:5c5cbc703168d1b7a838668998308018a2718c2130595e8e190220238addc96f"}, + {file = "pydantic_core-2.16.3-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99b6add4c0b39a513d323d3b93bc173dac663c27b99860dd5bf491b240d26137"}, + {file = "pydantic_core-2.16.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75f76ee558751746d6a38f89d60b6228fa174e5172d143886af0f85aa306fd89"}, + {file = "pydantic_core-2.16.3-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:00ee1c97b5364b84cb0bd82e9bbf645d5e2871fb8c58059d158412fee2d33d8a"}, + {file = "pydantic_core-2.16.3-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:287073c66748f624be4cef893ef9174e3eb88fe0b8a78dc22e88eca4bc357ca6"}, + {file = "pydantic_core-2.16.3-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:ed25e1835c00a332cb10c683cd39da96a719ab1dfc08427d476bce41b92531fc"}, + {file = "pydantic_core-2.16.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:86b3d0033580bd6bbe07590152007275bd7af95f98eaa5bd36f3da219dcd93da"}, + {file = "pydantic_core-2.16.3.tar.gz", hash = "sha256:1cac689f80a3abab2d3c0048b29eea5751114054f032a941a32de4c852c59cad"}, ] [package.dependencies] @@ -3041,42 +3116,42 @@ windows-terminal = ["colorama (>=0.4.6)"] [[package]] name = "pymupdf" 
-version = "1.23.25" +version = "1.23.26" description = "A high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents." optional = false python-versions = ">=3.8" files = [ - {file = "PyMuPDF-1.23.25-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:6be2b20fbff40602f673fc8e60fde3e5911397f8ca9ed6aa2d15be94b12cc2c4"}, - {file = "PyMuPDF-1.23.25-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:0f6923a44fbeaeefaabb2fa10955dcef3624e8826db661201951f3b3409fed32"}, - {file = "PyMuPDF-1.23.25-cp310-none-manylinux2014_aarch64.whl", hash = "sha256:8eeb2e97347586ec293fddaf61e8dfc58d6b2763406e8f7a6e45b560bf9b15a3"}, - {file = "PyMuPDF-1.23.25-cp310-none-manylinux2014_x86_64.whl", hash = "sha256:dca46799c152051697c5e88d66c17ba6d0244668d0c4dd8a2ba2d8d3cb745988"}, - {file = "PyMuPDF-1.23.25-cp310-none-win32.whl", hash = "sha256:88bfed1bd13ec84869489fc7b97381016cb8b99956073f4c3e8ac8c840bbb15a"}, - {file = "PyMuPDF-1.23.25-cp310-none-win_amd64.whl", hash = "sha256:98a78582c8a0c61b372e2bcd63dc61efc873e40b7d1f0b896a195e1a9ef9ffa7"}, - {file = "PyMuPDF-1.23.25-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:d7792810634036a745ea3eb3c4ccf2b6adab55ca9644e3352747d2b5aa5327f9"}, - {file = "PyMuPDF-1.23.25-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:03bd1985b0234c3d2b8e26bb3e9ab1d2641dbada1e199b838a6bf884f35224c8"}, - {file = "PyMuPDF-1.23.25-cp311-none-manylinux2014_aarch64.whl", hash = "sha256:638fcb1f7551eb5ab582e412e204e8ded94acbbc37bc7f1e891a5dfc428881ee"}, - {file = "PyMuPDF-1.23.25-cp311-none-manylinux2014_x86_64.whl", hash = "sha256:067c88b4e6609cb7e74d98d0b0a35c11eb8e29f4fc51dc7ed1dd448b81d347c7"}, - {file = "PyMuPDF-1.23.25-cp311-none-win32.whl", hash = "sha256:a694f160d1701285cf3152951430740878d168511cd9ea0a3adcfaf3cac00322"}, - {file = "PyMuPDF-1.23.25-cp311-none-win_amd64.whl", hash = "sha256:514bcb679926b33413637b0bd73b223c90fb0d19352caf3395d0f23b1d47e8af"}, - {file = "PyMuPDF-1.23.25-cp312-none-macosx_10_9_x86_64.whl", hash = "sha256:bba342321e1b5574631894d7d34ec046605d953a23553b7d2f9c0e4d3c27254b"}, - {file = "PyMuPDF-1.23.25-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:b2cb058c8229f9697deebe0574f7d95e4b9a5e295ceafd554346bbd464141e89"}, - {file = "PyMuPDF-1.23.25-cp312-none-manylinux2014_aarch64.whl", hash = "sha256:2479473b533936593428ce78499a1e9901570110ac602f03f1f3174efa0fa6a8"}, - {file = "PyMuPDF-1.23.25-cp312-none-manylinux2014_x86_64.whl", hash = "sha256:a247a4be1e43a6127ee305eae9f65767ee7519a2aa0cb1a2aa6acfd4e7fe7a9b"}, - {file = "PyMuPDF-1.23.25-cp312-none-win32.whl", hash = "sha256:b062be400bbaff6e8b17c0a8da9481e01ec935f97967e0870e9aacd7ba60a52a"}, - {file = "PyMuPDF-1.23.25-cp312-none-win_amd64.whl", hash = "sha256:b12e608761e1586a65f6e96a34417a91f814dbab29f2929b41d825ab32fab6ef"}, - {file = "PyMuPDF-1.23.25-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:ac97691c0e0e23607626d394bd660a46ea33f64921dc9288cf24daee207f9fe3"}, - {file = "PyMuPDF-1.23.25-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:c0a16cda5dc9b59d494ae23bdd9c4a3db53d04f2b6390265f5c0fe6269777975"}, - {file = "PyMuPDF-1.23.25-cp38-none-manylinux2014_aarch64.whl", hash = "sha256:23d735db51722a889bb50636d161d2747f08fa0b82cc2e4a7eb8e228b25d1c4e"}, - {file = "PyMuPDF-1.23.25-cp38-none-manylinux2014_x86_64.whl", hash = "sha256:cbc1407dcf01b2e3e547b2d7643b97cc44c0950d2bb4b12c74322664c5cb37d7"}, - {file = "PyMuPDF-1.23.25-cp38-none-win32.whl", hash = "sha256:c29518701d6360beb01c25cf69a77b6426db90a9e7cd11179b3bd783c7fb4cb1"}, - {file = 
"PyMuPDF-1.23.25-cp38-none-win_amd64.whl", hash = "sha256:c1bb6fa9e00c846e6829dec2bee8326754adaef5c80626b99233c01923f0342c"}, - {file = "PyMuPDF-1.23.25-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:514b272bfcd897f9ae29384da04167dcdea3b13ce0f2b9099b645314355d037d"}, - {file = "PyMuPDF-1.23.25-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:ef345a5b050d0869ef404845075edd5f4bd7fd99e235f4d32ce85f423779a120"}, - {file = "PyMuPDF-1.23.25-cp39-none-manylinux2014_aarch64.whl", hash = "sha256:b3ade5b349c38ddffb24f8c266fbcd7161f488c43960ff0f03f977d40d4df967"}, - {file = "PyMuPDF-1.23.25-cp39-none-manylinux2014_x86_64.whl", hash = "sha256:111d795a3e840aec2ad66beebd90a5327994ec85ed56fd68312f5463062dbbfa"}, - {file = "PyMuPDF-1.23.25-cp39-none-win32.whl", hash = "sha256:2237ce9897771f4af686cc0c81517ffb020fc1a011b95ccf5ccf05383492bd6d"}, - {file = "PyMuPDF-1.23.25-cp39-none-win_amd64.whl", hash = "sha256:251c9c321a2112716068d5ae11deedd1911d0387cbdd0ef19adb216a3adf882c"}, - {file = "PyMuPDF-1.23.25.tar.gz", hash = "sha256:eb414e92f08107f43576a1fedea28aa837220b15ad58c8e32015435fe96cc03e"}, + {file = "PyMuPDF-1.23.26-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:645a05321aecc8c45739f71f0eb574ce33138d19189582ffa5241fea3a8e2549"}, + {file = "PyMuPDF-1.23.26-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:2dfc9e010669ae92fade6fb72aaea49ebe3b8dcd7ee4dcbbe50115abcaa4d3fe"}, + {file = "PyMuPDF-1.23.26-cp310-none-manylinux2014_aarch64.whl", hash = "sha256:734ee380b3abd038602be79114194a3cb74ac102b7c943bcb333104575922c50"}, + {file = "PyMuPDF-1.23.26-cp310-none-manylinux2014_x86_64.whl", hash = "sha256:b22f8d854f8196ad5b20308c1cebad3d5189ed9f0988acbafa043947ea7e6c55"}, + {file = "PyMuPDF-1.23.26-cp310-none-win32.whl", hash = "sha256:cc0f794e3466bc96b5bf79d42fbc1551428751e3fef38ebc10ac70396b676144"}, + {file = "PyMuPDF-1.23.26-cp310-none-win_amd64.whl", hash = "sha256:2eb701247d8e685a24e45899d1175f01a3ce5fc792a4431c91fbb68633b29298"}, + {file = "PyMuPDF-1.23.26-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:e2804a64bb57da414781e312fb0561f6be67658ad57ed4a73dce008b23fc70a6"}, + {file = "PyMuPDF-1.23.26-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:97b40bb22e3056874634617a90e0ed24a5172cf71791b9e25d1d91c6743bc567"}, + {file = "PyMuPDF-1.23.26-cp311-none-manylinux2014_aarch64.whl", hash = "sha256:fab8833559bc47ab26ce736f915b8fc1dd37c108049b90396f7cd5e1004d7593"}, + {file = "PyMuPDF-1.23.26-cp311-none-manylinux2014_x86_64.whl", hash = "sha256:f25aafd3e7fb9d7761a22acf2b67d704f04cc36d4dc33a3773f0eb3f4ec3606f"}, + {file = "PyMuPDF-1.23.26-cp311-none-win32.whl", hash = "sha256:05e672ed3e82caca7ef02a88ace30130b1dd392a1190f03b2b58ffe7aa331400"}, + {file = "PyMuPDF-1.23.26-cp311-none-win_amd64.whl", hash = "sha256:92b3c4dd4d0491d495f333be2d41f4e1c155a409bc9d04b5ff29655dccbf4655"}, + {file = "PyMuPDF-1.23.26-cp312-none-macosx_10_9_x86_64.whl", hash = "sha256:a217689ede18cc6991b4e6a78afee8a440b3075d53b9dec4ba5ef7487d4547e9"}, + {file = "PyMuPDF-1.23.26-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:42ad2b819b90ce1947e11b90ec5085889df0a2e3aa0207bc97ecacfc6157cabc"}, + {file = "PyMuPDF-1.23.26-cp312-none-manylinux2014_aarch64.whl", hash = "sha256:99607649f89a02bba7d8ebe96e2410664316adc95e9337f7dfeff6a154f93049"}, + {file = "PyMuPDF-1.23.26-cp312-none-manylinux2014_x86_64.whl", hash = "sha256:bb42d4b8407b4de7cb58c28f01449f16f32a6daed88afb41108f1aeb3552bdd4"}, + {file = "PyMuPDF-1.23.26-cp312-none-win32.whl", hash = "sha256:c40d044411615e6f0baa7d3d933b3032cf97e168c7fa77d1be8a46008c109aee"}, + {file = 
"PyMuPDF-1.23.26-cp312-none-win_amd64.whl", hash = "sha256:3f876533aa7f9a94bcd9a0225ce72571b7808260903fec1d95c120bc842fb52d"}, + {file = "PyMuPDF-1.23.26-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:52df831d46beb9ff494f5fba3e5d069af6d81f49abf6b6e799ee01f4f8fa6799"}, + {file = "PyMuPDF-1.23.26-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:0bbb0cf6593e53524f3fc26fb5e6ead17c02c64791caec7c4afe61b677dedf80"}, + {file = "PyMuPDF-1.23.26-cp38-none-manylinux2014_aarch64.whl", hash = "sha256:5ef4360f20015673c20cf59b7e19afc97168795188c584254ed3778cde43ce77"}, + {file = "PyMuPDF-1.23.26-cp38-none-manylinux2014_x86_64.whl", hash = "sha256:d7cd88842b2e7f4c71eef4d87c98c35646b80b60e6375392d7ce40e519261f59"}, + {file = "PyMuPDF-1.23.26-cp38-none-win32.whl", hash = "sha256:6577e2f473625e2d0df5f5a3bf1e4519e94ae749733cc9937994d1b256687bfa"}, + {file = "PyMuPDF-1.23.26-cp38-none-win_amd64.whl", hash = "sha256:fbe1a3255b2cd0d769b2da2c4efdd0c0f30d4961a1aac02c0f75cf951b337aa4"}, + {file = "PyMuPDF-1.23.26-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:73fce034f2afea886a59ead2d0caedf27e2b2a8558b5da16d0286882e0b1eb82"}, + {file = "PyMuPDF-1.23.26-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:b3de8618b7cb5b36db611083840b3bcf09b11a893e2d8262f4e042102c7e65de"}, + {file = "PyMuPDF-1.23.26-cp39-none-manylinux2014_aarch64.whl", hash = "sha256:879e7f5ad35709d8760ab6103c3d5dac8ab8043a856ab3653fd324af7358ee87"}, + {file = "PyMuPDF-1.23.26-cp39-none-manylinux2014_x86_64.whl", hash = "sha256:deee96c2fd415ded7b5070d8d5b2c60679aee6ed0e28ac0d2cb998060d835c2c"}, + {file = "PyMuPDF-1.23.26-cp39-none-win32.whl", hash = "sha256:9f7f4ef99dd8ac97fb0b852efa3dcbee515798078b6c79a6a13c7b1e7c5d41a4"}, + {file = "PyMuPDF-1.23.26-cp39-none-win_amd64.whl", hash = "sha256:ba9a54552c7afb9ec85432c765e2fa9a81413acfaa7d70db7c9b528297749e5b"}, + {file = "PyMuPDF-1.23.26.tar.gz", hash = "sha256:a904261b317b761b0aa2bd2c1f6cd25d25aa4258be67a90c02a878efc5dca649"}, ] [package.dependencies] @@ -3099,13 +3174,13 @@ files = [ [[package]] name = "pypdf" -version = "4.0.2" +version = "4.1.0" description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files" optional = false python-versions = ">=3.6" files = [ - {file = "pypdf-4.0.2-py3-none-any.whl", hash = "sha256:a62daa2a24d5a608ba1b6284dde185317ce3644f89b9ebe5314d0c5d1c9f257d"}, - {file = "pypdf-4.0.2.tar.gz", hash = "sha256:3316d9ddfcff5df67ae3cdfe8b945c432aa43e7f970bae7c2a4ab4fe129cd937"}, + {file = "pypdf-4.1.0-py3-none-any.whl", hash = "sha256:16cac912a05200099cef3f347c4c7e0aaf0a6d027603b8f9a973c0ea500dff89"}, + {file = "pypdf-4.1.0.tar.gz", hash = "sha256:01c3257ec908676efd60a4537e525b89d48e0852bc92b4e0aa4cc646feda17cc"}, ] [package.extras] @@ -3237,13 +3312,13 @@ testing = ["filelock"] [[package]] name = "python-dateutil" -version = "2.8.2" +version = "2.9.0.post0" description = "Extensions to the standard Python datetime module" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ - {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, - {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, ] 
[package.dependencies] @@ -3358,13 +3433,13 @@ files = [ [[package]] name = "ragas" -version = "0.1.1" +version = "0.1.3" description = "" optional = false python-versions = "*" files = [ - {file = "ragas-0.1.1-py3-none-any.whl", hash = "sha256:e7b3e65e950fbb7ca697c51d27d0234fa24b82c7209252b9b60b40c0414599cf"}, - {file = "ragas-0.1.1.tar.gz", hash = "sha256:e006b340924ac63c25ccf72e334e767f6ee121131137cf033091b7410da27ece"}, + {file = "ragas-0.1.3-py3-none-any.whl", hash = "sha256:e15828697556a84fec324cbafdbe86ef7e725e5d0f33a4fccb4246dd60cdf256"}, + {file = "ragas-0.1.3.tar.gz", hash = "sha256:37eb0b34489e442210ba223dcc3c96632f75db6ad6a18b0c504005bc059e130b"}, ] [package.dependencies] @@ -3526,13 +3601,13 @@ rsa = ["oauthlib[signedtoken] (>=3.0.0)"] [[package]] name = "rich" -version = "13.7.0" +version = "13.7.1" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" optional = false python-versions = ">=3.7.0" files = [ - {file = "rich-13.7.0-py3-none-any.whl", hash = "sha256:6da14c108c4866ee9520bbffa71f6fe3962e193b7da68720583850cd4548e235"}, - {file = "rich-13.7.0.tar.gz", hash = "sha256:5cb5123b5cf9ee70584244246816e9114227e0b98ad9176eede6ad54bf5403fa"}, + {file = "rich-13.7.1-py3-none-any.whl", hash = "sha256:4edbae314f59eb482f54e9e30bf00d33350aaa94f4bfcd4e9e3110e64d0d7222"}, + {file = "rich-13.7.1.tar.gz", hash = "sha256:9be308cb1fe2f1f57d67ce99e95af38a1e2bc71ad9813b0e247cf7ffbcc3a432"}, ] [package.dependencies] @@ -3558,13 +3633,13 @@ pyasn1 = ">=0.1.3" [[package]] name = "sentry-sdk" -version = "1.40.5" +version = "1.41.0" description = "Python client for Sentry (https://sentry.io)" optional = false python-versions = "*" files = [ - {file = "sentry-sdk-1.40.5.tar.gz", hash = "sha256:d2dca2392cc5c9a2cc9bb874dd7978ebb759682fe4fe889ee7e970ee8dd1c61e"}, - {file = "sentry_sdk-1.40.5-py2.py3-none-any.whl", hash = "sha256:d188b407c9bacbe2a50a824e1f8fb99ee1aeb309133310488c570cb6d7056643"}, + {file = "sentry-sdk-1.41.0.tar.gz", hash = "sha256:4f2d6c43c07925d8cd10dfbd0970ea7cb784f70e79523cca9dbcd72df38e5a46"}, + {file = "sentry_sdk-1.41.0-py2.py3-none-any.whl", hash = "sha256:be4f8f4b29a80b6a3b71f0f31487beb9e296391da20af8504498a328befed53f"}, ] [package.dependencies] @@ -3603,19 +3678,19 @@ tornado = ["tornado (>=5)"] [[package]] name = "setuptools" -version = "69.1.0" +version = "69.1.1" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "setuptools-69.1.0-py3-none-any.whl", hash = "sha256:c054629b81b946d63a9c6e732bc8b2513a7c3ea645f11d0139a2191d735c60c6"}, - {file = "setuptools-69.1.0.tar.gz", hash = "sha256:850894c4195f09c4ed30dba56213bf7c3f21d86ed6bdaafb5df5972593bfc401"}, + {file = "setuptools-69.1.1-py3-none-any.whl", hash = "sha256:02fa291a0471b3a18b2b2481ed902af520c69e8ae0919c13da936542754b4c56"}, + {file = "setuptools-69.1.1.tar.gz", hash = "sha256:5c0806c7d9af348e6dd3777b4f4dbb42c7ad85b190104837488eab9a7c945cf8"}, ] [package.extras] docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", 
"pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] -testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.1)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] [[package]] name = "six" @@ -3630,13 +3705,13 @@ files = [ [[package]] name = "sniffio" -version = "1.3.0" +version = "1.3.1" description = "Sniff out which async library your code is running under" optional = false python-versions = ">=3.7" files = [ - {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"}, - {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, + {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, + {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, ] [[package]] @@ -3652,60 +3727,60 @@ files = [ [[package]] name = "sqlalchemy" -version = "2.0.27" +version = "2.0.28" description = "Database Abstraction Library" optional = false python-versions = ">=3.7" files = [ - {file = "SQLAlchemy-2.0.27-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d04e579e911562f1055d26dab1868d3e0bb905db3bccf664ee8ad109f035618a"}, - {file = "SQLAlchemy-2.0.27-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fa67d821c1fd268a5a87922ef4940442513b4e6c377553506b9db3b83beebbd8"}, - {file = "SQLAlchemy-2.0.27-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c7a596d0be71b7baa037f4ac10d5e057d276f65a9a611c46970f012752ebf2d"}, - {file = "SQLAlchemy-2.0.27-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:954d9735ee9c3fa74874c830d089a815b7b48df6f6b6e357a74130e478dbd951"}, - {file = "SQLAlchemy-2.0.27-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:5cd20f58c29bbf2680039ff9f569fa6d21453fbd2fa84dbdb4092f006424c2e6"}, - {file = "SQLAlchemy-2.0.27-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:03f448ffb731b48323bda68bcc93152f751436ad6037f18a42b7e16af9e91c07"}, - {file = "SQLAlchemy-2.0.27-cp310-cp310-win32.whl", hash = "sha256:d997c5938a08b5e172c30583ba6b8aad657ed9901fc24caf3a7152eeccb2f1b4"}, - {file = "SQLAlchemy-2.0.27-cp310-cp310-win_amd64.whl", hash = "sha256:eb15ef40b833f5b2f19eeae65d65e191f039e71790dd565c2af2a3783f72262f"}, - {file = "SQLAlchemy-2.0.27-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6c5bad7c60a392850d2f0fee8f355953abaec878c483dd7c3836e0089f046bf6"}, - {file = 
"SQLAlchemy-2.0.27-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3012ab65ea42de1be81fff5fb28d6db893ef978950afc8130ba707179b4284a"}, - {file = "SQLAlchemy-2.0.27-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dbcd77c4d94b23e0753c5ed8deba8c69f331d4fd83f68bfc9db58bc8983f49cd"}, - {file = "SQLAlchemy-2.0.27-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d177b7e82f6dd5e1aebd24d9c3297c70ce09cd1d5d37b43e53f39514379c029c"}, - {file = "SQLAlchemy-2.0.27-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:680b9a36029b30cf063698755d277885d4a0eab70a2c7c6e71aab601323cba45"}, - {file = "SQLAlchemy-2.0.27-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1306102f6d9e625cebaca3d4c9c8f10588735ef877f0360b5cdb4fdfd3fd7131"}, - {file = "SQLAlchemy-2.0.27-cp311-cp311-win32.whl", hash = "sha256:5b78aa9f4f68212248aaf8943d84c0ff0f74efc65a661c2fc68b82d498311fd5"}, - {file = "SQLAlchemy-2.0.27-cp311-cp311-win_amd64.whl", hash = "sha256:15e19a84b84528f52a68143439d0c7a3a69befcd4f50b8ef9b7b69d2628ae7c4"}, - {file = "SQLAlchemy-2.0.27-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:0de1263aac858f288a80b2071990f02082c51d88335a1db0d589237a3435fe71"}, - {file = "SQLAlchemy-2.0.27-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce850db091bf7d2a1f2fdb615220b968aeff3849007b1204bf6e3e50a57b3d32"}, - {file = "SQLAlchemy-2.0.27-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8dfc936870507da96aebb43e664ae3a71a7b96278382bcfe84d277b88e379b18"}, - {file = "SQLAlchemy-2.0.27-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c4fbe6a766301f2e8a4519f4500fe74ef0a8509a59e07a4085458f26228cd7cc"}, - {file = "SQLAlchemy-2.0.27-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:4535c49d961fe9a77392e3a630a626af5baa967172d42732b7a43496c8b28876"}, - {file = "SQLAlchemy-2.0.27-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:0fb3bffc0ced37e5aa4ac2416f56d6d858f46d4da70c09bb731a246e70bff4d5"}, - {file = "SQLAlchemy-2.0.27-cp312-cp312-win32.whl", hash = "sha256:7f470327d06400a0aa7926b375b8e8c3c31d335e0884f509fe272b3c700a7254"}, - {file = "SQLAlchemy-2.0.27-cp312-cp312-win_amd64.whl", hash = "sha256:f9374e270e2553653d710ece397df67db9d19c60d2647bcd35bfc616f1622dcd"}, - {file = "SQLAlchemy-2.0.27-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e97cf143d74a7a5a0f143aa34039b4fecf11343eed66538610debc438685db4a"}, - {file = "SQLAlchemy-2.0.27-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7b5a3e2120982b8b6bd1d5d99e3025339f7fb8b8267551c679afb39e9c7c7f1"}, - {file = "SQLAlchemy-2.0.27-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e36aa62b765cf9f43a003233a8c2d7ffdeb55bc62eaa0a0380475b228663a38f"}, - {file = "SQLAlchemy-2.0.27-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:5ada0438f5b74c3952d916c199367c29ee4d6858edff18eab783b3978d0db16d"}, - {file = "SQLAlchemy-2.0.27-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:b1d9d1bfd96eef3c3faedb73f486c89e44e64e40e5bfec304ee163de01cf996f"}, - {file = "SQLAlchemy-2.0.27-cp37-cp37m-win32.whl", hash = "sha256:ca891af9f3289d24a490a5fde664ea04fe2f4984cd97e26de7442a4251bd4b7c"}, - {file = "SQLAlchemy-2.0.27-cp37-cp37m-win_amd64.whl", hash = "sha256:fd8aafda7cdff03b905d4426b714601c0978725a19efc39f5f207b86d188ba01"}, - {file = "SQLAlchemy-2.0.27-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ec1f5a328464daf7a1e4e385e4f5652dd9b1d12405075ccba1df842f7774b4fc"}, - {file = 
"SQLAlchemy-2.0.27-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ad862295ad3f644e3c2c0d8b10a988e1600d3123ecb48702d2c0f26771f1c396"}, - {file = "SQLAlchemy-2.0.27-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48217be1de7d29a5600b5c513f3f7664b21d32e596d69582be0a94e36b8309cb"}, - {file = "SQLAlchemy-2.0.27-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e56afce6431450442f3ab5973156289bd5ec33dd618941283847c9fd5ff06bf"}, - {file = "SQLAlchemy-2.0.27-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:611068511b5531304137bcd7fe8117c985d1b828eb86043bd944cebb7fae3910"}, - {file = "SQLAlchemy-2.0.27-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b86abba762ecfeea359112b2bb4490802b340850bbee1948f785141a5e020de8"}, - {file = "SQLAlchemy-2.0.27-cp38-cp38-win32.whl", hash = "sha256:30d81cc1192dc693d49d5671cd40cdec596b885b0ce3b72f323888ab1c3863d5"}, - {file = "SQLAlchemy-2.0.27-cp38-cp38-win_amd64.whl", hash = "sha256:120af1e49d614d2525ac247f6123841589b029c318b9afbfc9e2b70e22e1827d"}, - {file = "SQLAlchemy-2.0.27-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d07ee7793f2aeb9b80ec8ceb96bc8cc08a2aec8a1b152da1955d64e4825fcbac"}, - {file = "SQLAlchemy-2.0.27-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:cb0845e934647232b6ff5150df37ceffd0b67b754b9fdbb095233deebcddbd4a"}, - {file = "SQLAlchemy-2.0.27-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fc19ae2e07a067663dd24fca55f8ed06a288384f0e6e3910420bf4b1270cc51"}, - {file = "SQLAlchemy-2.0.27-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b90053be91973a6fb6020a6e44382c97739736a5a9d74e08cc29b196639eb979"}, - {file = "SQLAlchemy-2.0.27-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:2f5c9dfb0b9ab5e3a8a00249534bdd838d943ec4cfb9abe176a6c33408430230"}, - {file = "SQLAlchemy-2.0.27-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:33e8bde8fff203de50399b9039c4e14e42d4d227759155c21f8da4a47fc8053c"}, - {file = "SQLAlchemy-2.0.27-cp39-cp39-win32.whl", hash = "sha256:d873c21b356bfaf1589b89090a4011e6532582b3a8ea568a00e0c3aab09399dd"}, - {file = "SQLAlchemy-2.0.27-cp39-cp39-win_amd64.whl", hash = "sha256:ff2f1b7c963961d41403b650842dc2039175b906ab2093635d8319bef0b7d620"}, - {file = "SQLAlchemy-2.0.27-py3-none-any.whl", hash = "sha256:1ab4e0448018d01b142c916cc7119ca573803a4745cfe341b8f95657812700ac"}, - {file = "SQLAlchemy-2.0.27.tar.gz", hash = "sha256:86a6ed69a71fe6b88bf9331594fa390a2adda4a49b5c06f98e47bf0d392534f8"}, + {file = "SQLAlchemy-2.0.28-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e0b148ab0438f72ad21cb004ce3bdaafd28465c4276af66df3b9ecd2037bf252"}, + {file = "SQLAlchemy-2.0.28-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:bbda76961eb8f27e6ad3c84d1dc56d5bc61ba8f02bd20fcf3450bd421c2fcc9c"}, + {file = "SQLAlchemy-2.0.28-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:feea693c452d85ea0015ebe3bb9cd15b6f49acc1a31c28b3c50f4db0f8fb1e71"}, + {file = "SQLAlchemy-2.0.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5da98815f82dce0cb31fd1e873a0cb30934971d15b74e0d78cf21f9e1b05953f"}, + {file = "SQLAlchemy-2.0.28-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4a5adf383c73f2d49ad15ff363a8748319ff84c371eed59ffd0127355d6ea1da"}, + {file = "SQLAlchemy-2.0.28-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:56856b871146bfead25fbcaed098269d90b744eea5cb32a952df00d542cdd368"}, + {file = "SQLAlchemy-2.0.28-cp310-cp310-win32.whl", hash = 
"sha256:943aa74a11f5806ab68278284a4ddd282d3fb348a0e96db9b42cb81bf731acdc"}, + {file = "SQLAlchemy-2.0.28-cp310-cp310-win_amd64.whl", hash = "sha256:c6c4da4843e0dabde41b8f2e8147438330924114f541949e6318358a56d1875a"}, + {file = "SQLAlchemy-2.0.28-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:46a3d4e7a472bfff2d28db838669fc437964e8af8df8ee1e4548e92710929adc"}, + {file = "SQLAlchemy-2.0.28-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0d3dd67b5d69794cfe82862c002512683b3db038b99002171f624712fa71aeaa"}, + {file = "SQLAlchemy-2.0.28-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c61e2e41656a673b777e2f0cbbe545323dbe0d32312f590b1bc09da1de6c2a02"}, + {file = "SQLAlchemy-2.0.28-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0315d9125a38026227f559488fe7f7cee1bd2fbc19f9fd637739dc50bb6380b2"}, + {file = "SQLAlchemy-2.0.28-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:af8ce2d31679006e7b747d30a89cd3ac1ec304c3d4c20973f0f4ad58e2d1c4c9"}, + {file = "SQLAlchemy-2.0.28-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:81ba314a08c7ab701e621b7ad079c0c933c58cdef88593c59b90b996e8b58fa5"}, + {file = "SQLAlchemy-2.0.28-cp311-cp311-win32.whl", hash = "sha256:1ee8bd6d68578e517943f5ebff3afbd93fc65f7ef8f23becab9fa8fb315afb1d"}, + {file = "SQLAlchemy-2.0.28-cp311-cp311-win_amd64.whl", hash = "sha256:ad7acbe95bac70e4e687a4dc9ae3f7a2f467aa6597049eeb6d4a662ecd990bb6"}, + {file = "SQLAlchemy-2.0.28-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d3499008ddec83127ab286c6f6ec82a34f39c9817f020f75eca96155f9765097"}, + {file = "SQLAlchemy-2.0.28-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9b66fcd38659cab5d29e8de5409cdf91e9986817703e1078b2fdaad731ea66f5"}, + {file = "SQLAlchemy-2.0.28-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bea30da1e76cb1acc5b72e204a920a3a7678d9d52f688f087dc08e54e2754c67"}, + {file = "SQLAlchemy-2.0.28-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:124202b4e0edea7f08a4db8c81cc7859012f90a0d14ba2bf07c099aff6e96462"}, + {file = "SQLAlchemy-2.0.28-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e23b88c69497a6322b5796c0781400692eca1ae5532821b39ce81a48c395aae9"}, + {file = "SQLAlchemy-2.0.28-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b6303bfd78fb3221847723104d152e5972c22367ff66edf09120fcde5ddc2e2"}, + {file = "SQLAlchemy-2.0.28-cp312-cp312-win32.whl", hash = "sha256:a921002be69ac3ab2cf0c3017c4e6a3377f800f1fca7f254c13b5f1a2f10022c"}, + {file = "SQLAlchemy-2.0.28-cp312-cp312-win_amd64.whl", hash = "sha256:b4a2cf92995635b64876dc141af0ef089c6eea7e05898d8d8865e71a326c0385"}, + {file = "SQLAlchemy-2.0.28-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e91b5e341f8c7f1e5020db8e5602f3ed045a29f8e27f7f565e0bdee3338f2c7"}, + {file = "SQLAlchemy-2.0.28-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45c7b78dfc7278329f27be02c44abc0d69fe235495bb8e16ec7ef1b1a17952db"}, + {file = "SQLAlchemy-2.0.28-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3eba73ef2c30695cb7eabcdb33bb3d0b878595737479e152468f3ba97a9c22a4"}, + {file = "SQLAlchemy-2.0.28-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:5df5d1dafb8eee89384fb7a1f79128118bc0ba50ce0db27a40750f6f91aa99d5"}, + {file = "SQLAlchemy-2.0.28-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:2858bbab1681ee5406650202950dc8f00e83b06a198741b7c656e63818633526"}, + {file = "SQLAlchemy-2.0.28-cp37-cp37m-win32.whl", hash = 
"sha256:9461802f2e965de5cff80c5a13bc945abea7edaa1d29360b485c3d2b56cdb075"}, + {file = "SQLAlchemy-2.0.28-cp37-cp37m-win_amd64.whl", hash = "sha256:a6bec1c010a6d65b3ed88c863d56b9ea5eeefdf62b5e39cafd08c65f5ce5198b"}, + {file = "SQLAlchemy-2.0.28-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:843a882cadebecc655a68bd9a5b8aa39b3c52f4a9a5572a3036fb1bb2ccdc197"}, + {file = "SQLAlchemy-2.0.28-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:dbb990612c36163c6072723523d2be7c3eb1517bbdd63fe50449f56afafd1133"}, + {file = "SQLAlchemy-2.0.28-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7e4baf9161d076b9a7e432fce06217b9bd90cfb8f1d543d6e8c4595627edb9"}, + {file = "SQLAlchemy-2.0.28-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0a5354cb4de9b64bccb6ea33162cb83e03dbefa0d892db88a672f5aad638a75"}, + {file = "SQLAlchemy-2.0.28-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:fffcc8edc508801ed2e6a4e7b0d150a62196fd28b4e16ab9f65192e8186102b6"}, + {file = "SQLAlchemy-2.0.28-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:aca7b6d99a4541b2ebab4494f6c8c2f947e0df4ac859ced575238e1d6ca5716b"}, + {file = "SQLAlchemy-2.0.28-cp38-cp38-win32.whl", hash = "sha256:8c7f10720fc34d14abad5b647bc8202202f4948498927d9f1b4df0fb1cf391b7"}, + {file = "SQLAlchemy-2.0.28-cp38-cp38-win_amd64.whl", hash = "sha256:243feb6882b06a2af68ecf4bec8813d99452a1b62ba2be917ce6283852cf701b"}, + {file = "SQLAlchemy-2.0.28-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fc4974d3684f28b61b9a90fcb4c41fb340fd4b6a50c04365704a4da5a9603b05"}, + {file = "SQLAlchemy-2.0.28-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:87724e7ed2a936fdda2c05dbd99d395c91ea3c96f029a033a4a20e008dd876bf"}, + {file = "SQLAlchemy-2.0.28-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:68722e6a550f5de2e3cfe9da6afb9a7dd15ef7032afa5651b0f0c6b3adb8815d"}, + {file = "SQLAlchemy-2.0.28-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:328529f7c7f90adcd65aed06a161851f83f475c2f664a898af574893f55d9e53"}, + {file = "SQLAlchemy-2.0.28-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:df40c16a7e8be7413b885c9bf900d402918cc848be08a59b022478804ea076b8"}, + {file = "SQLAlchemy-2.0.28-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:426f2fa71331a64f5132369ede5171c52fd1df1bd9727ce621f38b5b24f48750"}, + {file = "SQLAlchemy-2.0.28-cp39-cp39-win32.whl", hash = "sha256:33157920b233bc542ce497a81a2e1452e685a11834c5763933b440fedd1d8e2d"}, + {file = "SQLAlchemy-2.0.28-cp39-cp39-win_amd64.whl", hash = "sha256:2f60843068e432311c886c5f03c4664acaef507cf716f6c60d5fde7265be9d7b"}, + {file = "SQLAlchemy-2.0.28-py3-none-any.whl", hash = "sha256:78bb7e8da0183a8301352d569900d9d3594c48ac21dc1c2ec6b3121ed8b6c986"}, + {file = "SQLAlchemy-2.0.28.tar.gz", hash = "sha256:dd53b6c4e6d960600fd6532b79ee28e2da489322fcf6648738134587faf767b6"}, ] [package.dependencies] @@ -4029,13 +4104,13 @@ test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6. 
[[package]] name = "typing-extensions" -version = "4.9.0" +version = "4.10.0" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" files = [ - {file = "typing_extensions-4.9.0-py3-none-any.whl", hash = "sha256:af72aea155e91adfc61c3ae9e0e342dbc0cba726d6cba4b6c72c1f34e47291cd"}, - {file = "typing_extensions-4.9.0.tar.gz", hash = "sha256:23478f88c37f27d76ac8aee6c905017a143b0b1b886c3c9f66bc2fd94f9f5783"}, + {file = "typing_extensions-4.10.0-py3-none-any.whl", hash = "sha256:69b1a937c3a517342112fb4c6df7e72fc39a38e7891a5730ed4985b5214b5475"}, + {file = "typing_extensions-4.10.0.tar.gz", hash = "sha256:b0abd7c89e8fb96f98db18d86106ff1d90ab692004eb746cf6eda2682f91b3cb"}, ] [[package]] diff --git a/tests/test_everything.py b/tests/test_everything.py index 5d9bf770e..52a6e9e8b 100644 --- a/tests/test_everything.py +++ b/tests/test_everything.py @@ -117,6 +117,7 @@ def test_everything(): metric9, metric10, ], + run_async=False, ) From 533e73a03a9851240a718e7e8daed14462b8e2af Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sat, 9 Mar 2024 14:40:50 +0800 Subject: [PATCH 44/59] added async to llamaindex --- deepeval/integrations/llama_index/evaluators.py | 12 ++++++------ deepeval/metrics/toxicity/toxicity.py | 1 + llama_test/chatbot.py | 6 +++--- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/deepeval/integrations/llama_index/evaluators.py b/deepeval/integrations/llama_index/evaluators.py index 36a57a60e..ba4e895f0 100644 --- a/deepeval/integrations/llama_index/evaluators.py +++ b/deepeval/integrations/llama_index/evaluators.py @@ -56,7 +56,7 @@ async def aevaluate( include_reason=self.include_reason, model=self.model, ) - metric.measure(test_case) + await metric.a_measure(test_case) return EvaluationResult( query=query, response=response, @@ -108,7 +108,7 @@ async def aevaluate( include_reason=self.include_reason, model=self.model, ) - metric.measure(test_case) + await metric.a_measure(test_case) return EvaluationResult( query=query, response=response, @@ -160,7 +160,7 @@ async def aevaluate( include_reason=self.include_reason, model=self.model, ) - metric.measure(test_case) + await metric.a_measure(test_case) return EvaluationResult( query=query, response=response, @@ -209,7 +209,7 @@ async def aevaluate( model=self.model, include_reason=self.include_reason, ) - metric.measure(test_case) + await metric.a_measure(test_case) return EvaluationResult( query=query, response=response, @@ -261,7 +261,7 @@ async def aevaluate( model=self.model, include_reason=self.include_reason, ) - metric.measure(test_case) + await metric.a_measure(test_case) return EvaluationResult( query=query, response=response, @@ -313,7 +313,7 @@ async def aevaluate( model=self.model, include_reason=self.include_reason, ) - metric.measure(test_case) + await metric.a_measure(test_case) return EvaluationResult( query=query, response=response, diff --git a/deepeval/metrics/toxicity/toxicity.py b/deepeval/metrics/toxicity/toxicity.py index f6d250691..f6ee73567 100644 --- a/deepeval/metrics/toxicity/toxicity.py +++ b/deepeval/metrics/toxicity/toxicity.py @@ -78,6 +78,7 @@ async def a_measure( self.verdicts: List[ToxicityVerdict] = ( await self._a_generate_verdicts() ) + self.score = self._calculate_score() self.reason = await self._a_generate_reason() self.success = self.score <= self.threshold diff --git a/llama_test/chatbot.py b/llama_test/chatbot.py index c689a14b6..684afcf6d 100644 --- a/llama_test/chatbot.py +++ b/llama_test/chatbot.py @@ -10,7 +10,7 @@ def 
query(user_input): res = query_engine.query(user_input) - # evaluator = DeepEvalToxicityEvaluator() - # result = evaluator.evaluate_response(query=user_input, response=res) - # print(result) + evaluator = DeepEvalToxicityEvaluator() + result = evaluator.evaluate_response(query=user_input, response=res) + print(result) return res.response From 4ef8e61090357211680134a90723449d04bbfbeb Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sat, 9 Mar 2024 16:23:58 +0800 Subject: [PATCH 45/59] reformat --- deepeval/evaluate.py | 26 +++++++++++-------- .../answer_relevancy/answer_relevancy.py | 16 +++++------- deepeval/metrics/base_metric.py | 4 +-- deepeval/metrics/bias/bias.py | 17 ++++++------ .../contextual_precision.py | 18 ++++++------- .../contextual_recall/contextual_recall.py | 17 ++++++------ .../contextual_relevancy.py | 16 +++++------- deepeval/metrics/faithfulness/faithfulness.py | 17 ++++++------ deepeval/metrics/g_eval/g_eval.py | 22 ++++++++-------- .../metrics/hallucination/hallucination.py | 16 +++++------- deepeval/metrics/indicator.py | 24 +++++++++++------ .../metrics/summarization/summarization.py | 16 +++++------- deepeval/metrics/toxicity/toxicity.py | 17 ++++++------ deepeval/utils.py | 9 ------- tests/test_deployment.py | 2 +- tests/test_everything.py | 16 ++++++------ 16 files changed, 121 insertions(+), 132 deletions(-) diff --git a/deepeval/evaluate.py b/deepeval/evaluate.py index 346eb95ed..bee59044b 100644 --- a/deepeval/evaluate.py +++ b/deepeval/evaluate.py @@ -4,7 +4,7 @@ import time from dataclasses import dataclass -from deepeval.utils import drop_and_copy, get_or_create_event_loop +from deepeval.utils import drop_and_copy from deepeval.telemetry import capture_evaluation_count from deepeval.metrics import BaseMetric from deepeval.metrics.indicator import ( @@ -84,7 +84,7 @@ def execute_test_cases( for metric in metrics: # Override metric async - metric.run_async = False + metric.async_mode = False metric.measure(test_case) metric_metadata = MetricsMetadata( @@ -122,6 +122,7 @@ async def a_execute_test_cases( metrics: List[BaseMetric], save_to_disk: bool = False, ) -> List[TestResult]: + print("#####Seems to be ok?##########") test_results: List[TestResult] = [] test_run_manager.save_to_disk = save_to_disk for index, test_case in enumerate(test_cases): @@ -175,8 +176,7 @@ def assert_test( raise TypeError("'test_case' must be an instance of 'LLMTestCase'.") if run_async: - loop = get_or_create_event_loop() - test_result = loop.run_until_complete( + test_result = asyncio.run( a_execute_test_cases( [test_case], metrics, get_is_running_deepeval() ) @@ -205,6 +205,7 @@ def evaluate( metrics: List[BaseMetric], run_async: bool = True, show_indicator: bool = True, + print_results: bool = True, ): if show_indicator is False: disable_indicator() @@ -222,21 +223,24 @@ def evaluate( ) test_run_manager.reset() - print("Evaluating test cases...") + + if print_results: + print("Evaluating test cases...") if run_async: - loop = get_or_create_event_loop() - test_results = loop.run_until_complete( + test_results = asyncio.run( a_execute_test_cases(test_cases, metrics, True) ) else: test_results = execute_test_cases(test_cases, metrics, True) capture_evaluation_count() - for test_result in test_results: - print_test_result(test_result) - print("") - print("-" * 70) + if print_results: + for test_result in test_results: + print_test_result(test_result) + + print("") + print("-" * 70) test_run_manager.wrap_up_test_run(display_table=False) return test_results diff --git 
a/deepeval/metrics/answer_relevancy/answer_relevancy.py b/deepeval/metrics/answer_relevancy/answer_relevancy.py index 692c4caba..50dce2d03 100644 --- a/deepeval/metrics/answer_relevancy/answer_relevancy.py +++ b/deepeval/metrics/answer_relevancy/answer_relevancy.py @@ -4,7 +4,6 @@ from deepeval.utils import ( trimAndLoadJson, - get_or_create_event_loop, check_test_case_params, ) from deepeval.test_case import LLMTestCase, LLMTestCaseParams @@ -31,7 +30,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - run_async: bool = True, + async_mode: bool = True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -41,17 +40,15 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.run_async = run_async + self.async_mode = async_mode self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: check_test_case_params(test_case, required_params, self.__name__) + with metric_progress_indicator(self): - if self.run_async: - loop = get_or_create_event_loop() - loop.run_until_complete( - self.a_measure(test_case, _show_indicator=False) - ) + if self.async_mode: + asyncio.run(self.a_measure(test_case, _show_indicator=False)) else: self.statements: List[str] = self._generate_statements( test_case.actual_output @@ -69,8 +66,9 @@ async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: check_test_case_params(test_case, required_params, self.__name__) + with metric_progress_indicator( - self, is_async=True, _show_indicator=_show_indicator + self, async_mode=True, _show_indicator=_show_indicator ): self.statements: List[str] = await self._a_generate_statements( test_case.actual_output diff --git a/deepeval/metrics/base_metric.py b/deepeval/metrics/base_metric.py index 7cd6d3212..ac566f240 100644 --- a/deepeval/metrics/base_metric.py +++ b/deepeval/metrics/base_metric.py @@ -10,7 +10,7 @@ class BaseMetric: reason: Optional[str] = None evaluation_model: Optional[str] = None strict_mode: bool = False - run_async: Optional[bool] = None + async_mode: bool = True @property def threshold(self) -> float: @@ -27,7 +27,7 @@ def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: @abstractmethod async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float: raise NotImplementedError( - f"Async execution for {self.__class__.__name__} not supported yet. Please turn set 'run_async' to 'False'." + f"Async execution for {self.__class__.__name__} not supported yet. Please set 'async_mode' to 'False'." 
) @abstractmethod diff --git a/deepeval/metrics/bias/bias.py b/deepeval/metrics/bias/bias.py index 80347905c..28251474f 100644 --- a/deepeval/metrics/bias/bias.py +++ b/deepeval/metrics/bias/bias.py @@ -1,5 +1,6 @@ from typing import List, Optional, Union from pydantic import BaseModel, Field +import asyncio from deepeval.metrics import BaseMetric from deepeval.test_case import LLMTestCase, LLMTestCaseParams @@ -8,7 +9,6 @@ from deepeval.models import GPTModel, DeepEvalBaseLLM from deepeval.utils import ( trimAndLoadJson, - get_or_create_event_loop, check_test_case_params, ) from deepeval.metrics.bias.template import BiasTemplate @@ -32,7 +32,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - run_async: bool = True, + async_mode: bool = True, strict_mode: bool = False, ): self.threshold = 0 if strict_mode else threshold @@ -42,17 +42,15 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.run_async = run_async + self.async_mode = async_mode self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: check_test_case_params(test_case, required_params, self.__name__) + with metric_progress_indicator(self): - if self.run_async: - loop = get_or_create_event_loop() - loop.run_until_complete( - self.a_measure(test_case, _show_indicator=False) - ) + if self.async_mode: + asyncio.run(self.a_measure(test_case, _show_indicator=False)) else: self.opinions: List[str] = self._generate_opinions( test_case.actual_output @@ -68,9 +66,10 @@ async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: check_test_case_params(test_case, required_params, self.__name__) + with metric_progress_indicator( self, - is_async=True, + async_mode=True, _show_indicator=_show_indicator, ): self.opinions: List[str] = await self._a_generate_opinions( diff --git a/deepeval/metrics/contextual_precision/contextual_precision.py b/deepeval/metrics/contextual_precision/contextual_precision.py index 229a044f6..314f9fa3b 100644 --- a/deepeval/metrics/contextual_precision/contextual_precision.py +++ b/deepeval/metrics/contextual_precision/contextual_precision.py @@ -1,9 +1,9 @@ from typing import Optional, List, Union from pydantic import BaseModel +import asyncio from deepeval.utils import ( trimAndLoadJson, - get_or_create_event_loop, check_test_case_params, ) from deepeval.test_case import LLMTestCase, LLMTestCaseParams @@ -35,7 +35,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - run_async: bool = True, + async_mode: bool = True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -45,18 +45,15 @@ def __init__( else: self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() - self.run_async = run_async + self.async_mode = async_mode self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: check_test_case_params(test_case, required_params, self.__name__) - with metric_progress_indicator(self): - if self.run_async: - loop = get_or_create_event_loop() - loop.run_until_complete( - self.a_measure(test_case, _show_indicator=False) - ) + with metric_progress_indicator(self): + if self.async_mode: + asyncio.run(self.a_measure(test_case, _show_indicator=False)) else: self.verdicts: List[ContextualPrecisionVerdict] = ( self._generate_verdicts( @@ -75,9 
+72,10 @@ async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: check_test_case_params(test_case, required_params, self.__name__) + with metric_progress_indicator( self, - is_async=True, + async_mode=True, _show_indicator=_show_indicator, ): self.verdicts: List[ContextualPrecisionVerdict] = ( diff --git a/deepeval/metrics/contextual_recall/contextual_recall.py b/deepeval/metrics/contextual_recall/contextual_recall.py index baf039416..3bed8ec18 100644 --- a/deepeval/metrics/contextual_recall/contextual_recall.py +++ b/deepeval/metrics/contextual_recall/contextual_recall.py @@ -1,9 +1,9 @@ from typing import Optional, List, Union from pydantic import BaseModel, Field +import asyncio from deepeval.utils import ( trimAndLoadJson, - get_or_create_event_loop, check_test_case_params, ) from deepeval.test_case import LLMTestCase, LLMTestCaseParams @@ -32,7 +32,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - run_async: bool = True, + async_mode: bool = True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -42,17 +42,15 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.run_async = run_async + self.async_mode = async_mode self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: check_test_case_params(test_case, required_params, self.__name__) + with metric_progress_indicator(self): - if self.run_async: - loop = get_or_create_event_loop() - loop.run_until_complete( - self.a_measure(test_case, _show_indicator=False) - ) + if self.async_mode: + asyncio.run(self.a_measure(test_case, _show_indicator=False)) else: self.verdicts: List[ContextualRecallVerdict] = ( self._generate_verdicts( @@ -69,9 +67,10 @@ async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: check_test_case_params(test_case, required_params, self.__name__) + with metric_progress_indicator( self, - is_async=True, + async_mode=True, _show_indicator=_show_indicator, ): self.verdicts: List[ContextualRecallVerdict] = ( diff --git a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py index 3409b42bd..b47d039fb 100644 --- a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +++ b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py @@ -4,7 +4,6 @@ from deepeval.utils import ( trimAndLoadJson, - get_or_create_event_loop, check_test_case_params, ) from deepeval.test_case import LLMTestCase, LLMTestCaseParams @@ -34,7 +33,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - run_async: bool = True, + async_mode: bool = True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -44,17 +43,15 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.run_async = run_async + self.async_mode = async_mode self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: check_test_case_params(test_case, required_params, self.__name__) + with metric_progress_indicator(self): - if self.run_async: - loop = get_or_create_event_loop() - loop.run_until_complete( - self.a_measure(test_case, _show_indicator=False) - ) + if self.async_mode: + 
asyncio.run(self.a_measure(test_case, _show_indicator=False)) else: self.verdicts: List[ContextualRelevancyVerdict] = ( self._generate_verdicts( @@ -71,9 +68,10 @@ async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: check_test_case_params(test_case, required_params, self.__name__) + with metric_progress_indicator( self, - is_async=True, + async_mode=True, _show_indicator=_show_indicator, ): self.verdicts: List[ContextualRelevancyVerdict] = ( diff --git a/deepeval/metrics/faithfulness/faithfulness.py b/deepeval/metrics/faithfulness/faithfulness.py index ccdcb1fe1..2a46cdb89 100644 --- a/deepeval/metrics/faithfulness/faithfulness.py +++ b/deepeval/metrics/faithfulness/faithfulness.py @@ -6,7 +6,6 @@ from deepeval.metrics import BaseMetric from deepeval.utils import ( trimAndLoadJson, - get_or_create_event_loop, check_test_case_params, ) from deepeval.models import GPTModel, DeepEvalBaseLLM @@ -32,7 +31,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - run_async: bool = True, + async_mode: bool = True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -42,17 +41,15 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.run_async = run_async + self.async_mode = async_mode self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: check_test_case_params(test_case, required_params, self.__name__) + with metric_progress_indicator(self): - if self.run_async: - loop = get_or_create_event_loop() - loop.run_until_complete( - self.a_measure(test_case, _show_indicator=False) - ) + if self.async_mode: + asyncio.run(self.a_measure(test_case, _show_indicator=False)) else: self.truths = self._generate_truths(test_case.retrieval_context) self.claims = self._generate_claims(test_case.actual_output) @@ -67,13 +64,15 @@ async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: check_test_case_params(test_case, required_params, self.__name__) + with metric_progress_indicator( - self, is_async=True, _show_indicator=_show_indicator + self, async_mode=True, _show_indicator=_show_indicator ): self.truths, self.claims = await asyncio.gather( self._a_generate_truths(test_case.retrieval_context), self._a_generate_claims(test_case.actual_output), ) + print("after truths and claims") self.verdicts = await self._a_generate_verdicts() self.score = self._calculate_score() self.reason = await self._a_generate_reason() diff --git a/deepeval/metrics/g_eval/g_eval.py b/deepeval/metrics/g_eval/g_eval.py index 7a017b5f4..c67664a5f 100644 --- a/deepeval/metrics/g_eval/g_eval.py +++ b/deepeval/metrics/g_eval/g_eval.py @@ -1,5 +1,8 @@ +"""LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf""" + from typing import Optional, List, Tuple, Union from pydantic import BaseModel +import asyncio from deepeval.metrics import BaseMetric from deepeval.test_case import LLMTestCase, LLMTestCaseParams @@ -7,7 +10,6 @@ from deepeval.utils import ( trimAndLoadJson, check_test_case_params, - get_or_create_event_loop, ) from deepeval.models import GPTModel, DeepEvalBaseLLM from deepeval.telemetry import capture_metric_type @@ -53,7 +55,7 @@ def __init__( evaluation_steps: Optional[List[str]] = None, model: Optional[Union[str, DeepEvalBaseLLM]] = None, threshold: float = 0.5, - run_async: bool = True, + async_mode: bool 
= True, strict_mode: bool = False, ): self.name = name @@ -84,19 +86,16 @@ def __init__( self.evaluation_steps = evaluation_steps self.threshold = 1 if strict_mode else threshold self.strict_mode = strict_mode - self.run_async = run_async + self.async_mode = async_mode def measure(self, test_case: LLMTestCase) -> float: - """LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf""" check_test_case_params( test_case, self.evaluation_params, f"GEval({self.__name__})" ) + with metric_progress_indicator(self): - if self.run_async: - loop = get_or_create_event_loop() - loop.run_until_complete( - self.a_measure(test_case, _show_indicator=False) - ) + if self.async_mode: + asyncio.run(self.a_measure(test_case, _show_indicator=False)) else: self.evaluation_steps: List[str] = ( self._generate_evaluation_steps() @@ -119,9 +118,10 @@ async def a_measure( check_test_case_params( test_case, self.evaluation_params, f"GEval({self.__name__})" ) + with metric_progress_indicator( self, - is_async=True, + async_mode=True, _show_indicator=_show_indicator, ): self.evaluation_steps: List[str] = ( @@ -201,4 +201,4 @@ def is_successful(self) -> bool: @property def __name__(self): - return f"GEval({self.name})" + return f"GEval ({self.name})" diff --git a/deepeval/metrics/hallucination/hallucination.py b/deepeval/metrics/hallucination/hallucination.py index 0e09c42e9..dc7e9e979 100644 --- a/deepeval/metrics/hallucination/hallucination.py +++ b/deepeval/metrics/hallucination/hallucination.py @@ -1,11 +1,11 @@ from typing import Optional, Union, List from pydantic import BaseModel, Field +import asyncio from deepeval.test_case import LLMTestCase, LLMTestCaseParams from deepeval.metrics import BaseMetric from deepeval.utils import ( trimAndLoadJson, - get_or_create_event_loop, check_test_case_params, ) from deepeval.metrics.hallucination.template import HallucinationTemplate @@ -31,7 +31,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - run_async: bool = False, + async_mode: bool = False, strict_mode: bool = False, ): self.threshold = 0 if strict_mode else threshold @@ -41,17 +41,14 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.run_async = run_async + self.async_mode = async_mode self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: check_test_case_params(test_case, required_params, self.__name__) with metric_progress_indicator(self): - if self.run_async: - loop = get_or_create_event_loop() - loop.run_until_complete( - self.a_measure(test_case, _show_indicator=False) - ) + if self.async_mode: + asyncio.run(self.a_measure(test_case, _show_indicator=False)) else: self.verdicts: List[HallucinationVerdict] = ( self._generate_verdicts( @@ -68,8 +65,9 @@ async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: check_test_case_params(test_case, required_params, self.__name__) + with metric_progress_indicator( - self, is_async=True, _show_indicator=_show_indicator + self, async_mode=True, _show_indicator=_show_indicator ): self.verdicts: List[HallucinationVerdict] = ( await self._a_generate_verdicts( diff --git a/deepeval/metrics/indicator.py b/deepeval/metrics/indicator.py index 8e1b84324..efa1b596b 100644 --- a/deepeval/metrics/indicator.py +++ b/deepeval/metrics/indicator.py @@ -12,20 +12,25 @@ def format_metric_description( - metric: BaseMetric, 
is_async: Optional[bool] = None + metric: BaseMetric, async_mode: Optional[bool] = None ): - if is_async is None: - run_async = metric.run_async + if async_mode is None: + run_async = metric.async_mode else: - run_async = is_async + run_async = async_mode - return f"✨ You're running DeepEval's latest [rgb(106,0,255)]{metric.__name__} Metric[/rgb(106,0,255)]! [rgb(55,65,81)](using {metric.evaluation_model}, strict={metric.strict_mode})...[/rgb(55,65,81)]" + if run_async: + is_async = "yes" + else: + is_async = "no" + + return f"✨ You're running DeepEval's latest [rgb(106,0,255)]{metric.__name__} Metric[/rgb(106,0,255)]! [rgb(55,65,81)](using {metric.evaluation_model}, strict={metric.strict_mode}, async_mode={run_async})...[/rgb(55,65,81)]" @contextmanager def metric_progress_indicator( metric: BaseMetric, - is_async: Optional[bool] = None, + async_mode: Optional[bool] = None, _show_indicator: bool = True, total: int = 9999, transient: bool = True, @@ -39,7 +44,7 @@ def metric_progress_indicator( transient=transient, ) as progress: progress.add_task( - description=format_metric_description(metric, is_async), + description=format_metric_description(metric, async_mode), total=total, ) yield @@ -76,7 +81,10 @@ async def measure_metrics_with_indicator( tasks = [] for metric in metrics: task_id = progress.add_task( - description=format_metric_description(metric), total=100 + description=format_metric_description( + metric, async_mode=True + ), + total=100, ) tasks.append( measure_metric_task(task_id, progress, metric, test_case) diff --git a/deepeval/metrics/summarization/summarization.py b/deepeval/metrics/summarization/summarization.py index 6fb0cbfe0..5dc3fc399 100644 --- a/deepeval/metrics/summarization/summarization.py +++ b/deepeval/metrics/summarization/summarization.py @@ -8,7 +8,6 @@ from deepeval.models import GPTModel, DeepEvalBaseLLM from deepeval.utils import ( trimAndLoadJson, - get_or_create_event_loop, check_test_case_params, ) from deepeval.metrics.summarization.template import SummarizationTemplate @@ -47,7 +46,7 @@ def __init__( model: Optional[Union[str, DeepEvalBaseLLM]] = None, assessment_questions: Optional[List[str]] = None, include_reason: bool = True, - run_async=True, + async_mode=True, strict_mode: bool = False, ): self.threshold = 1 if strict_mode else threshold @@ -62,19 +61,17 @@ def __init__( else: self.assessment_questions = assessment_questions - self.run_async = run_async + self.async_mode = async_mode self.include_reason = include_reason self.n = n self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: check_test_case_params(test_case, required_params, self.__name__) + with metric_progress_indicator(self): - if self.run_async: - loop = get_or_create_event_loop() - loop.run_until_complete( - self.a_measure(test_case, _show_indicator=False) - ) + if self.async_mode: + asyncio.run(self.a_measure(test_case, _show_indicator=False)) else: self.truths: List[str] = self._generate_claims(test_case.input) self.claims: List[str] = self._generate_claims( @@ -102,9 +99,10 @@ async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: check_test_case_params(test_case, required_params, self.__name__) + with metric_progress_indicator( self, - is_async=True, + async_mode=True, _show_indicator=_show_indicator, ): self.truths, self.claims = await asyncio.gather( diff --git a/deepeval/metrics/toxicity/toxicity.py b/deepeval/metrics/toxicity/toxicity.py index f6d250691..24e578556 100644 --- 
a/deepeval/metrics/toxicity/toxicity.py +++ b/deepeval/metrics/toxicity/toxicity.py @@ -1,5 +1,6 @@ from typing import List, Optional, Union from pydantic import BaseModel, Field +import asyncio from deepeval.metrics import BaseMetric from deepeval.test_case import LLMTestCase, LLMTestCaseParams @@ -8,7 +9,6 @@ from deepeval.models import GPTModel, DeepEvalBaseLLM from deepeval.utils import ( trimAndLoadJson, - get_or_create_event_loop, check_test_case_params, ) from deepeval.metrics.bias.template import BiasTemplate @@ -32,7 +32,7 @@ def __init__( threshold: float = 0.5, model: Optional[Union[str, DeepEvalBaseLLM]] = None, include_reason: bool = True, - run_async: bool = True, + async_mode: bool = True, strict_mode: bool = False, ): self.threshold = 0 if strict_mode else threshold @@ -42,17 +42,15 @@ def __init__( self.model = GPTModel(model=model) self.evaluation_model = self.model.get_model_name() self.include_reason = include_reason - self.run_async = run_async + self.async_mode = async_mode self.strict_mode = strict_mode def measure(self, test_case: LLMTestCase) -> float: check_test_case_params(test_case, required_params, self.__name__) + with metric_progress_indicator(self): - if self.run_async: - loop = get_or_create_event_loop() - loop.run_until_complete( - self.a_measure(test_case, _show_indicator=False) - ) + if self.async_mode: + asyncio.run(self.a_measure(test_case, _show_indicator=False)) else: self.opinions: List[str] = self._generate_opinions( test_case.actual_output @@ -69,8 +67,9 @@ async def a_measure( self, test_case: LLMTestCase, _show_indicator: bool = True ) -> float: check_test_case_params(test_case, required_params, self.__name__) + with metric_progress_indicator( - self, is_async=True, _show_indicator=_show_indicator + self, async_mode=True, _show_indicator=_show_indicator ): self.opinions: List[str] = await self._a_generate_opinions( test_case.actual_output diff --git a/deepeval/utils.py b/deepeval/utils.py index 3765d9f07..6c99f26c6 100644 --- a/deepeval/utils.py +++ b/deepeval/utils.py @@ -56,15 +56,6 @@ def check_test_case_params( ) -def get_or_create_event_loop() -> asyncio.AbstractEventLoop: - try: - loop = asyncio.get_event_loop() - except RuntimeError: - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - return loop - - def login_with_confident_api_key(api_key: string): from rich import print diff --git a/tests/test_deployment.py b/tests/test_deployment.py index 9ea659b46..5cef9b2e9 100644 --- a/tests/test_deployment.py +++ b/tests/test_deployment.py @@ -13,7 +13,7 @@ class FakeMetric(BaseMetric): def __init__(self, threshold: float = 0.5): self.threshold = threshold - def measure(self, test_case: LLMTestCase, _run_async): + def measure(self, test_case: LLMTestCase): # Set self.success and self.score in the "measure" method self.score = random.uniform(0.0, 1.0) self.success = self.score >= self.threshold diff --git a/tests/test_everything.py b/tests/test_everything.py index 52a6e9e8b..9641efd0a 100644 --- a/tests/test_everything.py +++ b/tests/test_everything.py @@ -69,13 +69,13 @@ being composed mostly of rock and metal. 
""" -strict_mode = False +strict_mode = True @pytest.mark.skip(reason="openai is expensive") def test_everything(): metric1 = AnswerRelevancyMetric( - threshold=0.5, strict_mode=strict_mode, run_async=True + threshold=0.5, strict_mode=strict_mode, async_mode=True ) metric2 = FaithfulnessMetric(threshold=0.5, strict_mode=strict_mode) metric3 = ContextualPrecisionMetric(threshold=0.5, strict_mode=strict_mode) @@ -97,11 +97,11 @@ def test_everything(): ) test_case = LLMTestCase( - input=question, - actual_output=answer, - expected_output=answer, - retrieval_context=[one, two, three], - context=[four, five], + input="What is this?", + actual_output="this is a latte", + expected_output="this is a mocha", + retrieval_context=["I love coffee"], + context=["I love coffee"], ) assert_test( test_case, @@ -117,7 +117,7 @@ def test_everything(): metric9, metric10, ], - run_async=False, + # run_async=False, ) From 42c2d7a9684b7478bc9903192140edfb49e63c18 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sat, 9 Mar 2024 16:27:57 +0800 Subject: [PATCH 46/59] fix --- deepeval/evaluate.py | 1 - deepeval/metrics/faithfulness/faithfulness.py | 1 - tests/test_everything.py | 4 ++-- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/deepeval/evaluate.py b/deepeval/evaluate.py index bee59044b..a0ce01872 100644 --- a/deepeval/evaluate.py +++ b/deepeval/evaluate.py @@ -122,7 +122,6 @@ async def a_execute_test_cases( metrics: List[BaseMetric], save_to_disk: bool = False, ) -> List[TestResult]: - print("#####Seems to be ok?##########") test_results: List[TestResult] = [] test_run_manager.save_to_disk = save_to_disk for index, test_case in enumerate(test_cases): diff --git a/deepeval/metrics/faithfulness/faithfulness.py b/deepeval/metrics/faithfulness/faithfulness.py index 2a46cdb89..2e3e12143 100644 --- a/deepeval/metrics/faithfulness/faithfulness.py +++ b/deepeval/metrics/faithfulness/faithfulness.py @@ -72,7 +72,6 @@ async def a_measure( self._a_generate_truths(test_case.retrieval_context), self._a_generate_claims(test_case.actual_output), ) - print("after truths and claims") self.verdicts = await self._a_generate_verdicts() self.score = self._calculate_score() self.reason = await self._a_generate_reason() diff --git a/tests/test_everything.py b/tests/test_everything.py index 9641efd0a..8ebe2745b 100644 --- a/tests/test_everything.py +++ b/tests/test_everything.py @@ -72,10 +72,10 @@ strict_mode = True -@pytest.mark.skip(reason="openai is expensive") +# @pytest.mark.skip(reason="openai is expensive") def test_everything(): metric1 = AnswerRelevancyMetric( - threshold=0.5, strict_mode=strict_mode, async_mode=True + threshold=0.5, strict_mode=strict_mode, async_mode=False ) metric2 = FaithfulnessMetric(threshold=0.5, strict_mode=strict_mode) metric3 = ContextualPrecisionMetric(threshold=0.5, strict_mode=strict_mode) From ff667634cdc27d7d5cc98df345c3dd73f398d7e7 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sat, 9 Mar 2024 16:45:21 +0800 Subject: [PATCH 47/59] new release --- deepeval/_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deepeval/_version.py b/deepeval/_version.py index 172fe38af..0cd4febac 100644 --- a/deepeval/_version.py +++ b/deepeval/_version.py @@ -1 +1 @@ -__version__: str = "0.20.81" +__version__: str = "0.20.82" diff --git a/pyproject.toml b/pyproject.toml index 28f75735d..0679f57cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "deepeval" -version = "0.20.81" +version = "0.20.82" description 
= "The Evaluation Framework for LLMs" authors = ["Jeffrey Ip "] license = "Apache-2.0" From bd9ed3d53b63c8813df963d233047583771e3b73 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sat, 9 Mar 2024 16:46:11 +0800 Subject: [PATCH 48/59] update test --- tests/test_everything.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_everything.py b/tests/test_everything.py index 8ebe2745b..30c8576ce 100644 --- a/tests/test_everything.py +++ b/tests/test_everything.py @@ -72,7 +72,7 @@ strict_mode = True -# @pytest.mark.skip(reason="openai is expensive") +@pytest.mark.skip(reason="openai is expensive") def test_everything(): metric1 = AnswerRelevancyMetric( threshold=0.5, strict_mode=strict_mode, async_mode=False From 197e889b52ef7995b92b6876701b2a52dd7be910 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sat, 9 Mar 2024 16:46:45 +0800 Subject: [PATCH 49/59] . --- tests/test_everything.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_everything.py b/tests/test_everything.py index 30c8576ce..c18d04cd7 100644 --- a/tests/test_everything.py +++ b/tests/test_everything.py @@ -72,7 +72,7 @@ strict_mode = True -@pytest.mark.skip(reason="openai is expensive") +@pytest.mark.skip(reasgon="openai is expensive") def test_everything(): metric1 = AnswerRelevancyMetric( threshold=0.5, strict_mode=strict_mode, async_mode=False From 61fc3f8011993cdf1ea36145b062d50959c865c3 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sat, 9 Mar 2024 16:47:25 +0800 Subject: [PATCH 50/59] . --- tests/test_everything.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_everything.py b/tests/test_everything.py index c18d04cd7..30c8576ce 100644 --- a/tests/test_everything.py +++ b/tests/test_everything.py @@ -72,7 +72,7 @@ strict_mode = True -@pytest.mark.skip(reasgon="openai is expensive") +@pytest.mark.skip(reason="openai is expensive") def test_everything(): metric1 = AnswerRelevancyMetric( threshold=0.5, strict_mode=strict_mode, async_mode=False From c6b034f6888c24ef66b749dcf22d43155d6aee3d Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sat, 9 Mar 2024 17:20:39 +0800 Subject: [PATCH 51/59] fix asyncio run --- deepeval/evaluate.py | 9 +++++---- deepeval/integrations/llama_index/evaluators.py | 2 +- deepeval/metrics/answer_relevancy/answer_relevancy.py | 7 +++++-- deepeval/metrics/bias/bias.py | 7 +++++-- .../contextual_precision/contextual_precision.py | 7 +++++-- .../metrics/contextual_recall/contextual_recall.py | 7 +++++-- .../contextual_relevancy/contextual_relevancy.py | 8 ++++++-- deepeval/metrics/faithfulness/faithfulness.py | 6 +++++- deepeval/metrics/g_eval/g_eval.py | 7 +++++-- deepeval/metrics/hallucination/hallucination.py | 7 +++++-- deepeval/metrics/indicator.py | 2 +- deepeval/metrics/summarization/summarization.py | 8 ++++++-- deepeval/metrics/toxicity/toxicity.py | 7 +++++-- deepeval/utils.py | 11 +++++++++++ tests/test_everything.py | 2 +- tests/test_faithfulness.py | 1 - 16 files changed, 71 insertions(+), 27 deletions(-) diff --git a/deepeval/evaluate.py b/deepeval/evaluate.py index a0ce01872..1bf31e914 100644 --- a/deepeval/evaluate.py +++ b/deepeval/evaluate.py @@ -1,10 +1,9 @@ -import asyncio import os from typing import List, Optional import time from dataclasses import dataclass -from deepeval.utils import drop_and_copy +from deepeval.utils import drop_and_copy, get_or_create_event_loop from deepeval.telemetry import capture_evaluation_count from deepeval.metrics import BaseMetric from deepeval.metrics.indicator import ( 
@@ -175,7 +174,8 @@ def assert_test( raise TypeError("'test_case' must be an instance of 'LLMTestCase'.") if run_async: - test_result = asyncio.run( + loop = get_or_create_event_loop() + test_result = loop.run_until_complete( a_execute_test_cases( [test_case], metrics, get_is_running_deepeval() ) @@ -226,7 +226,8 @@ def evaluate( if print_results: print("Evaluating test cases...") if run_async: - test_results = asyncio.run( + loop = get_or_create_event_loop() + test_results = loop.run_until_complete( a_execute_test_cases(test_cases, metrics, True) ) else: diff --git a/deepeval/integrations/llama_index/evaluators.py b/deepeval/integrations/llama_index/evaluators.py index ba4e895f0..40637805e 100644 --- a/deepeval/integrations/llama_index/evaluators.py +++ b/deepeval/integrations/llama_index/evaluators.py @@ -1,6 +1,6 @@ -import asyncio from typing import Optional, Sequence, Any from llama_index.core.evaluation.base import BaseEvaluator, EvaluationResult +import asyncio from deepeval.test_case import LLMTestCase from deepeval.metrics import ( diff --git a/deepeval/metrics/answer_relevancy/answer_relevancy.py b/deepeval/metrics/answer_relevancy/answer_relevancy.py index 50dce2d03..73200a737 100644 --- a/deepeval/metrics/answer_relevancy/answer_relevancy.py +++ b/deepeval/metrics/answer_relevancy/answer_relevancy.py @@ -1,10 +1,10 @@ -import asyncio from typing import Optional, List, Union from pydantic import BaseModel, Field from deepeval.utils import ( trimAndLoadJson, check_test_case_params, + get_or_create_event_loop, ) from deepeval.test_case import LLMTestCase, LLMTestCaseParams from deepeval.metrics import BaseMetric @@ -48,7 +48,10 @@ def measure(self, test_case: LLMTestCase) -> float: with metric_progress_indicator(self): if self.async_mode: - asyncio.run(self.a_measure(test_case, _show_indicator=False)) + loop = get_or_create_event_loop() + loop.run_until_complete( + self.a_measure(test_case, _show_indicator=False) + ) else: self.statements: List[str] = self._generate_statements( test_case.actual_output diff --git a/deepeval/metrics/bias/bias.py b/deepeval/metrics/bias/bias.py index 28251474f..35e55fc68 100644 --- a/deepeval/metrics/bias/bias.py +++ b/deepeval/metrics/bias/bias.py @@ -1,6 +1,5 @@ from typing import List, Optional, Union from pydantic import BaseModel, Field -import asyncio from deepeval.metrics import BaseMetric from deepeval.test_case import LLMTestCase, LLMTestCaseParams @@ -10,6 +9,7 @@ from deepeval.utils import ( trimAndLoadJson, check_test_case_params, + get_or_create_event_loop, ) from deepeval.metrics.bias.template import BiasTemplate @@ -50,7 +50,10 @@ def measure(self, test_case: LLMTestCase) -> float: with metric_progress_indicator(self): if self.async_mode: - asyncio.run(self.a_measure(test_case, _show_indicator=False)) + loop = get_or_create_event_loop() + loop.run_until_complete( + self.a_measure(test_case, _show_indicator=False) + ) else: self.opinions: List[str] = self._generate_opinions( test_case.actual_output diff --git a/deepeval/metrics/contextual_precision/contextual_precision.py b/deepeval/metrics/contextual_precision/contextual_precision.py index 314f9fa3b..46471e956 100644 --- a/deepeval/metrics/contextual_precision/contextual_precision.py +++ b/deepeval/metrics/contextual_precision/contextual_precision.py @@ -1,10 +1,10 @@ from typing import Optional, List, Union from pydantic import BaseModel -import asyncio from deepeval.utils import ( trimAndLoadJson, check_test_case_params, + get_or_create_event_loop, ) from deepeval.test_case import 
LLMTestCase, LLMTestCaseParams from deepeval.metrics import BaseMetric @@ -53,7 +53,10 @@ def measure(self, test_case: LLMTestCase) -> float: with metric_progress_indicator(self): if self.async_mode: - asyncio.run(self.a_measure(test_case, _show_indicator=False)) + loop = get_or_create_event_loop() + loop.run_until_complete( + self.a_measure(test_case, _show_indicator=False) + ) else: self.verdicts: List[ContextualPrecisionVerdict] = ( self._generate_verdicts( diff --git a/deepeval/metrics/contextual_recall/contextual_recall.py b/deepeval/metrics/contextual_recall/contextual_recall.py index 3bed8ec18..a0e72a00e 100644 --- a/deepeval/metrics/contextual_recall/contextual_recall.py +++ b/deepeval/metrics/contextual_recall/contextual_recall.py @@ -1,10 +1,10 @@ from typing import Optional, List, Union from pydantic import BaseModel, Field -import asyncio from deepeval.utils import ( trimAndLoadJson, check_test_case_params, + get_or_create_event_loop, ) from deepeval.test_case import LLMTestCase, LLMTestCaseParams from deepeval.metrics import BaseMetric @@ -50,7 +50,10 @@ def measure(self, test_case: LLMTestCase) -> float: with metric_progress_indicator(self): if self.async_mode: - asyncio.run(self.a_measure(test_case, _show_indicator=False)) + loop = get_or_create_event_loop() + loop.run_until_complete( + self.a_measure(test_case, _show_indicator=False) + ) else: self.verdicts: List[ContextualRecallVerdict] = ( self._generate_verdicts( diff --git a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py index b47d039fb..42e55b09a 100644 --- a/deepeval/metrics/contextual_relevancy/contextual_relevancy.py +++ b/deepeval/metrics/contextual_relevancy/contextual_relevancy.py @@ -1,10 +1,11 @@ -import asyncio from typing import Optional, List, Union from pydantic import BaseModel, Field +import asyncio from deepeval.utils import ( trimAndLoadJson, check_test_case_params, + get_or_create_event_loop, ) from deepeval.test_case import LLMTestCase, LLMTestCaseParams from deepeval.metrics import BaseMetric @@ -51,7 +52,10 @@ def measure(self, test_case: LLMTestCase) -> float: with metric_progress_indicator(self): if self.async_mode: - asyncio.run(self.a_measure(test_case, _show_indicator=False)) + loop = get_or_create_event_loop() + loop.run_until_complete( + self.a_measure(test_case, _show_indicator=False) + ) else: self.verdicts: List[ContextualRelevancyVerdict] = ( self._generate_verdicts( diff --git a/deepeval/metrics/faithfulness/faithfulness.py b/deepeval/metrics/faithfulness/faithfulness.py index 2e3e12143..8ef2369a7 100644 --- a/deepeval/metrics/faithfulness/faithfulness.py +++ b/deepeval/metrics/faithfulness/faithfulness.py @@ -7,6 +7,7 @@ from deepeval.utils import ( trimAndLoadJson, check_test_case_params, + get_or_create_event_loop, ) from deepeval.models import GPTModel, DeepEvalBaseLLM from deepeval.metrics.faithfulness.template import FaithfulnessTemplate @@ -49,7 +50,10 @@ def measure(self, test_case: LLMTestCase) -> float: with metric_progress_indicator(self): if self.async_mode: - asyncio.run(self.a_measure(test_case, _show_indicator=False)) + loop = get_or_create_event_loop() + loop.run_until_complete( + self.a_measure(test_case, _show_indicator=False) + ) else: self.truths = self._generate_truths(test_case.retrieval_context) self.claims = self._generate_claims(test_case.actual_output) diff --git a/deepeval/metrics/g_eval/g_eval.py b/deepeval/metrics/g_eval/g_eval.py index c67664a5f..175a2fc07 100644 --- 
a/deepeval/metrics/g_eval/g_eval.py +++ b/deepeval/metrics/g_eval/g_eval.py @@ -2,7 +2,6 @@ from typing import Optional, List, Tuple, Union from pydantic import BaseModel -import asyncio from deepeval.metrics import BaseMetric from deepeval.test_case import LLMTestCase, LLMTestCaseParams @@ -10,6 +9,7 @@ from deepeval.utils import ( trimAndLoadJson, check_test_case_params, + get_or_create_event_loop, ) from deepeval.models import GPTModel, DeepEvalBaseLLM from deepeval.telemetry import capture_metric_type @@ -95,7 +95,10 @@ def measure(self, test_case: LLMTestCase) -> float: with metric_progress_indicator(self): if self.async_mode: - asyncio.run(self.a_measure(test_case, _show_indicator=False)) + loop = get_or_create_event_loop() + loop.run_until_complete( + self.a_measure(test_case, _show_indicator=False) + ) else: self.evaluation_steps: List[str] = ( self._generate_evaluation_steps() diff --git a/deepeval/metrics/hallucination/hallucination.py b/deepeval/metrics/hallucination/hallucination.py index dc7e9e979..3c5af900d 100644 --- a/deepeval/metrics/hallucination/hallucination.py +++ b/deepeval/metrics/hallucination/hallucination.py @@ -1,12 +1,12 @@ from typing import Optional, Union, List from pydantic import BaseModel, Field -import asyncio from deepeval.test_case import LLMTestCase, LLMTestCaseParams from deepeval.metrics import BaseMetric from deepeval.utils import ( trimAndLoadJson, check_test_case_params, + get_or_create_event_loop, ) from deepeval.metrics.hallucination.template import HallucinationTemplate from deepeval.models import GPTModel, DeepEvalBaseLLM @@ -48,7 +48,10 @@ def measure(self, test_case: LLMTestCase) -> float: check_test_case_params(test_case, required_params, self.__name__) with metric_progress_indicator(self): if self.async_mode: - asyncio.run(self.a_measure(test_case, _show_indicator=False)) + loop = get_or_create_event_loop() + loop.run_until_complete( + self.a_measure(test_case, _show_indicator=False) + ) else: self.verdicts: List[HallucinationVerdict] = ( self._generate_verdicts( diff --git a/deepeval/metrics/indicator.py b/deepeval/metrics/indicator.py index efa1b596b..055c34d73 100644 --- a/deepeval/metrics/indicator.py +++ b/deepeval/metrics/indicator.py @@ -1,10 +1,10 @@ -import asyncio from rich.console import Console from rich.progress import Progress, SpinnerColumn, TextColumn from contextlib import contextmanager import sys from typing import List, Optional import time +import asyncio from deepeval.metrics import BaseMetric from deepeval.test_case import LLMTestCase diff --git a/deepeval/metrics/summarization/summarization.py b/deepeval/metrics/summarization/summarization.py index 5dc3fc399..7f066e884 100644 --- a/deepeval/metrics/summarization/summarization.py +++ b/deepeval/metrics/summarization/summarization.py @@ -1,7 +1,7 @@ -import asyncio from typing import List, Optional, Union from enum import Enum from pydantic import BaseModel, Field +import asyncio from deepeval.test_case import LLMTestCase, LLMTestCaseParams from deepeval.metrics import BaseMetric @@ -9,6 +9,7 @@ from deepeval.utils import ( trimAndLoadJson, check_test_case_params, + get_or_create_event_loop, ) from deepeval.metrics.summarization.template import SummarizationTemplate from deepeval.metrics.faithfulness.template import FaithfulnessTemplate @@ -71,7 +72,10 @@ def measure(self, test_case: LLMTestCase) -> float: with metric_progress_indicator(self): if self.async_mode: - asyncio.run(self.a_measure(test_case, _show_indicator=False)) + loop = get_or_create_event_loop() + 
loop.run_until_complete( + self.a_measure(test_case, _show_indicator=False) + ) else: self.truths: List[str] = self._generate_claims(test_case.input) self.claims: List[str] = self._generate_claims( diff --git a/deepeval/metrics/toxicity/toxicity.py b/deepeval/metrics/toxicity/toxicity.py index 00fe080fc..c76c2ffe2 100644 --- a/deepeval/metrics/toxicity/toxicity.py +++ b/deepeval/metrics/toxicity/toxicity.py @@ -1,6 +1,5 @@ from typing import List, Optional, Union from pydantic import BaseModel, Field -import asyncio from deepeval.metrics import BaseMetric from deepeval.test_case import LLMTestCase, LLMTestCaseParams @@ -10,6 +9,7 @@ from deepeval.utils import ( trimAndLoadJson, check_test_case_params, + get_or_create_event_loop, ) from deepeval.metrics.bias.template import BiasTemplate from deepeval.metrics.toxicity.template import ToxicityTemplate @@ -50,7 +50,10 @@ def measure(self, test_case: LLMTestCase) -> float: with metric_progress_indicator(self): if self.async_mode: - asyncio.run(self.a_measure(test_case, _show_indicator=False)) + loop = get_or_create_event_loop() + loop.run_until_complete( + self.a_measure(test_case, _show_indicator=False) + ) else: self.opinions: List[str] = self._generate_opinions( test_case.actual_output diff --git a/deepeval/utils.py b/deepeval/utils.py index 6c99f26c6..cc199b7f1 100644 --- a/deepeval/utils.py +++ b/deepeval/utils.py @@ -17,6 +17,17 @@ from deepeval.test_case import LLMTestCase, LLMTestCaseParams +def get_or_create_event_loop() -> asyncio.AbstractEventLoop: + try: + loop = asyncio.get_event_loop() + if loop.is_closed(): + raise RuntimeError + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + return loop + + def show_indicator(): try: if os.environ["DISABLE_DEEPEVAL_INDICATOR"] == "YES": diff --git a/tests/test_everything.py b/tests/test_everything.py index 30c8576ce..591430e68 100644 --- a/tests/test_everything.py +++ b/tests/test_everything.py @@ -117,7 +117,7 @@ def test_everything(): metric9, metric10, ], - # run_async=False, + run_async=False, ) diff --git a/tests/test_faithfulness.py b/tests/test_faithfulness.py index 8c4af8365..15a49159a 100644 --- a/tests/test_faithfulness.py +++ b/tests/test_faithfulness.py @@ -1,4 +1,3 @@ -import asyncio import pytest from deepeval.test_case import LLMTestCase from deepeval.metrics import FaithfulnessMetric From 8deee855a98ff9bfda1a4f16586ac215ff8c191e Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sat, 9 Mar 2024 17:23:00 +0800 Subject: [PATCH 52/59] new release --- deepeval/_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deepeval/_version.py b/deepeval/_version.py index 0cd4febac..f17a4fa3b 100644 --- a/deepeval/_version.py +++ b/deepeval/_version.py @@ -1 +1 @@ -__version__: str = "0.20.82" +__version__: str = "0.20.83" diff --git a/pyproject.toml b/pyproject.toml index 0679f57cd..311f53f31 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "deepeval" -version = "0.20.82" +version = "0.20.83" description = "The Evaluation Framework for LLMs" authors = ["Jeffrey Ip "] license = "Apache-2.0" From 292242e34417b059e3d6dfa5c6b0849eeeb96a5f Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sat, 9 Mar 2024 18:11:51 +0800 Subject: [PATCH 53/59] allow nested loops --- deepeval/utils.py | 7 +++++++ tests/test_everything.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/deepeval/utils.py b/deepeval/utils.py index cc199b7f1..b648f5677 100644 --- 
a/deepeval/utils.py +++ b/deepeval/utils.py @@ -12,6 +12,7 @@ from dataclasses import asdict, is_dataclass import re import asyncio +import nest_asyncio from deepeval.key_handler import KeyValues, KEY_FILE_HANDLER from deepeval.test_case import LLMTestCase, LLMTestCaseParams @@ -20,6 +21,12 @@ def get_or_create_event_loop() -> asyncio.AbstractEventLoop: try: loop = asyncio.get_event_loop() + if loop.is_running(): + print( + "Event loop is already running. Applying nest_asyncio patch to allow async execution..." + ) + nest_asyncio.apply() + if loop.is_closed(): raise RuntimeError except RuntimeError: diff --git a/tests/test_everything.py b/tests/test_everything.py index 591430e68..30c8576ce 100644 --- a/tests/test_everything.py +++ b/tests/test_everything.py @@ -117,7 +117,7 @@ def test_everything(): metric9, metric10, ], - run_async=False, + # run_async=False, ) From d3694e045bce38d499ccdf974527d8e70a5f57bc Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sat, 9 Mar 2024 18:16:50 +0800 Subject: [PATCH 54/59] new release --- deepeval/_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deepeval/_version.py b/deepeval/_version.py index f17a4fa3b..0b5d0da7d 100644 --- a/deepeval/_version.py +++ b/deepeval/_version.py @@ -1 +1 @@ -__version__: str = "0.20.83" +__version__: str = "0.20.84" diff --git a/pyproject.toml b/pyproject.toml index 311f53f31..8ed3bbe9c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "deepeval" -version = "0.20.83" +version = "0.20.84" description = "The Evaluation Framework for LLMs" authors = ["Jeffrey Ip "] license = "Apache-2.0" From 5a1ae66e3dcb09b580183f4272e240dbb41a417a Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sat, 9 Mar 2024 18:34:44 +0800 Subject: [PATCH 55/59] fix indicator env --- deepeval/evaluate.py | 5 ++--- deepeval/utils.py | 7 +++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/deepeval/evaluate.py b/deepeval/evaluate.py index 1bf31e914..6931dd51c 100644 --- a/deepeval/evaluate.py +++ b/deepeval/evaluate.py @@ -13,7 +13,7 @@ from deepeval.tracing import get_trace_stack from deepeval.constants import PYTEST_RUN_TEST_NAME from deepeval.test_run import test_run_manager, APITestCase, MetricsMetadata -from deepeval.utils import get_is_running_deepeval, disable_indicator +from deepeval.utils import get_is_running_deepeval, set_indicator @dataclass @@ -206,8 +206,7 @@ def evaluate( show_indicator: bool = True, print_results: bool = True, ): - if show_indicator is False: - disable_indicator() + set_indicator(show_indicator) # TODO: refactor for metric in metrics: diff --git a/deepeval/utils.py b/deepeval/utils.py index b648f5677..7e02ca0e2 100644 --- a/deepeval/utils.py +++ b/deepeval/utils.py @@ -45,8 +45,11 @@ def show_indicator(): return True -def disable_indicator(): - os.environ["DISABLE_DEEPEVAL_INDICATOR"] = "YES" +def set_indicator(show_indicator: bool): + if show_indicator: + os.environ["DISABLE_DEEPEVAL_INDICATOR"] = "NO" + else: + os.environ["DISABLE_DEEPEVAL_INDICATOR"] = "YES" def check_test_case_params( From bb40704730687db980d0ac3d751b3555d7395df1 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sat, 9 Mar 2024 18:36:31 +0800 Subject: [PATCH 56/59] new release --- deepeval/_version.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deepeval/_version.py b/deepeval/_version.py index 0b5d0da7d..f06c02a39 100644 --- a/deepeval/_version.py +++ b/deepeval/_version.py @@ -1 +1 @@ -__version__: str = 
"0.20.84" +__version__: str = "0.20.85" diff --git a/pyproject.toml b/pyproject.toml index 8ed3bbe9c..4015aeee5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "deepeval" -version = "0.20.84" +version = "0.20.85" description = "The Evaluation Framework for LLMs" authors = ["Jeffrey Ip "] license = "Apache-2.0" From 4fade334dfe781c69d1cfb372722ee243f500c34 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sat, 9 Mar 2024 21:42:55 +0800 Subject: [PATCH 57/59] updated docs --- docs/docs/evaluation-datasets.mdx | 7 +-- docs/docs/evaluation-introduction.mdx | 17 ++++- docs/docs/evaluation-test-cases.mdx | 59 +++++++---------- docs/docs/metrics-answer-relevancy.mdx | 4 +- docs/docs/metrics-bias.mdx | 4 +- docs/docs/metrics-contextual-precision.mdx | 4 +- docs/docs/metrics-contextual-recall.mdx | 4 +- docs/docs/metrics-contextual-relevancy.mdx | 5 +- docs/docs/metrics-faithfulness.mdx | 5 +- docs/docs/metrics-hallucination.mdx | 5 +- docs/docs/metrics-introduction.mdx | 73 ++++++++++++++++++---- docs/docs/metrics-llm-evals.mdx | 6 +- docs/docs/metrics-summarization.mdx | 7 ++- docs/docs/metrics-toxicity.mdx | 2 + 14 files changed, 132 insertions(+), 70 deletions(-) diff --git a/docs/docs/evaluation-datasets.mdx b/docs/docs/evaluation-datasets.mdx index c0532b1e8..63d1c4183 100644 --- a/docs/docs/evaluation-datasets.mdx +++ b/docs/docs/evaluation-datasets.mdx @@ -113,8 +113,8 @@ There are two optional parameters when creating a `Synthesizer`: - [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4-0125-preview'. - [Optional] `multithreading`: a boolean which when set to `True`, enables concurrent generation of goldens. Defaulted to `True`. -:::caution -We highly recommend you to call `save_as()` to save all generated synthetic data. +:::tip +Remember to to call `save_as()` just to be safe. ::: ## Load an Existing Dataset @@ -225,7 +225,7 @@ deepeval test run test_bulk.py -n 3 ## Evaluate Your Dataset Without Pytest -Alternately, you can use deepeval's `evaluate` function to evaluate datasets. This approach avoids the CLI, but does not allow for parallel test execution. +Alternately, you can use `deepeval`'s `evaluate` function to evaluate datasets. This approach avoids the CLI, but does not allow for parallel test execution. ```python from deepeval import evaluate @@ -233,7 +233,6 @@ from deepeval.metrics import HallucinationMetric, AnswerRelevancyMetric from deepeval.dataset import EvaluationDataset dataset = EvaluationDataset(test_cases=[...]) - hallucination_metric = HallucinationMetric(threshold=0.3) answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5) diff --git a/docs/docs/evaluation-introduction.mdx b/docs/docs/evaluation-introduction.mdx index a452d718e..22a9a216c 100644 --- a/docs/docs/evaluation-introduction.mdx +++ b/docs/docs/evaluation-introduction.mdx @@ -127,6 +127,12 @@ And run the test file in the CLI: deepeval test run test_example.py ``` +There are two mandatory and one optional parameter when calling the `assert_test()` function: + +- `test_case`: a `LLMTestCase` +- `metrics`: a list of metrics of type `BaseMetric` +- [Optional] `run_async`: a boolean which when set to `True`, enables concurrent evaluation of all metrics. Defaulted to `True`. + :::info `@pytest.mark.parametrize` is a decorator offered by Pytest. 
It simply loops through your `EvaluationDataset` to evaluate each test case individually. ::: @@ -153,7 +159,7 @@ def function_to_be_called_after_test_run(): ## Evaluating Without Pytest -Alternately, you can use deepeval's `evaluate` function. This approach avoids the CLI (if you're in a notebook environment), but does not allow for parallel test execution. +Alternately, you can use `deepeval`'s `evaluate` function. This approach avoids the CLI (if you're in a notebook environment), but does not allow for parallel test execution. ```python from deepeval import evaluate @@ -161,12 +167,19 @@ from deepeval.metrics import AnswerRelevancyMetric from deepeval.dataset import EvaluationDataset dataset = EvaluationDataset(test_cases=[...]) - answer_relevancy_metric = AnswerRelevancyMetric() evaluate(dataset, [answer_relevancy_metric]) ``` +There are two mandatory and three optional arguments when calling the `evaluate()` function: + +- `test_case`: a `LLMTestCase` +- `metrics`: a list of metrics of type `BaseMetric` +- [Optional] `run_async`: a boolean which when set to `True`, enables concurrent evaluation of all metrics. Defaulted to `True`. +- [Optional] `show_indicator`: a boolean which when set to `True`, shows the progress indicator for each individual metric. Defaulted to `True`. +- [Optional] `print_results`: a boolean which when set to `True`, prints the result of each evaluation. Defaulted to `True`. + :::tip You can also replace `dataset` with a list of test cases, as shown in the [test cases section.](evaluation-test-cases#evaluate-test-cases-in-bulk) ::: diff --git a/docs/docs/evaluation-test-cases.mdx b/docs/docs/evaluation-test-cases.mdx index 086c26d03..3a93e0ac5 100644 --- a/docs/docs/evaluation-test-cases.mdx +++ b/docs/docs/evaluation-test-cases.mdx @@ -29,7 +29,7 @@ test_case = LLMTestCase( ) ``` -**Note that only `input` and `actual_output` is mandatory.** +**Note that only `input` and `actual_output` are mandatory.** However, depending on the specific metric you're evaluating your test cases on, you may or may not require a `retrieval_context`, `expected_output` and/or `context` as additional parameters. For example, you won't need `expected_output` and `context` if you're just measuring answer relevancy, but if you're evaluating hallucination you'll have to provide `context` in order for `deepeval` to know what the **ground truth** is. @@ -210,38 +210,6 @@ test_case = LLMTestCase( Similar to the `LatencyMetric`, the [`CostMetric`](metrics-cost) is the only `deepeval` metric that uses the `cost` parameter. ::: -## Run A Test Case - -`deepeval` offers an option to quickly run a test case without going through the CLI. - -```python -# A hypothetical LLM application example -import chatbot -from deepeval import run_test -from deepeval.metrics import HallucinationMetric -from deepeval.test_case import LLMTestCase - -prompt_template = """ - Impersonate a dog named Rocky when replying to the text below. - - {text} -""" - -prompt = prompt_template.format(text="Who's a good boy?") -context = ["Rocky is a good boy."] - -test_case = LLMTestCase( - input=prompt, - # Replace this with your actual LLM application - actual_output=chatbot.run(prompt), - expected_output="Me, ruff!", - context=context -) - -metric = HallucinationMetric(threshold=0.7) -run_test(test_case, [metric]) -``` - ## Assert A Test Case Before we begin going through the final sections, we highly recommend you to login to [Confident AI](https://confident-ai.com) (the platform powering deepeval) via the CLI. 
This way, you can keep track of all evaluation results generated each time you execute `deepeval test run`. @@ -250,10 +218,7 @@ Before we begin going through the final sections, we highly recommend you to log deepeval login ``` -Similar to Pytest, `deepeval` allows you to assert any test case you create by calling the `assert_test` function by running `deepeval test run` via the CLI. `assert_test` takes two mandatory arguments: - -- `test_case`: an `LLMTestCase` -- `metrics`: a list of metrics +Similar to Pytest, `deepeval` allows you to assert any test case you create by calling the `assert_test` function by running `deepeval test run` via the CLI. **A test case passes only if all metrics passess.** Depending on the metric, a combination of `input`, `actual_output`, `expected_output`, `context`, and `retrieval_context` is used to ascertain whether their criterion have been met. @@ -299,7 +264,17 @@ def test_case_2(): assert_test(test_case, metrics=[metric]) ``` -In the CLI, run the command `deepeval test run`, which uses Pytest under the hood. You can also include an optional `-n` flag follow by a number (that determines the number of processes that will be used) to run tests in parallel. +There are two mandatory and one optional parameter when calling the `assert_test()` function: + +- `test_case`: a `LLMTestCase` +- `metrics`: a list of metrics of type `BaseMetric` +- [Optional] `run_async`: a boolean which when set to `True`, enables concurrent evaluation of all metrics. Defaulted to `True`. + +:::info +The `run_async` parameter overrides the `async_mode` property of all metrics being evaluated. The `async_mode` property, as you'll learn later in the [metrics section](metrics-introduction), determines whether each metric can execute asynchronously. +::: + +To execute the test cases, run `deepeval test run` via the CLI, which uses `deepeval`'s Pytest integration under the hood to execute these tests. You can also include an optional `-n` flag follow by a number (that determines the number of processes that will be used) to run tests in parallel. ```console deepeval test run test_assert_example.py -n 4 @@ -347,4 +322,12 @@ metric = HallucinationMetric(threshold=0.7) evaluate(test_cases, [metric]) ``` +There are two mandatory and three optional arguments when calling the `evaluate()` function: + +- `test_case`: a `LLMTestCase` +- `metrics`: a list of metrics of type `BaseMetric` +- [Optional] `run_async`: a boolean which when set to `True`, enables concurrent evaluation of all metrics. Defaulted to `True`. +- [Optional] `show_indicator`: a boolean which when set to `True`, shows the progress indicator for each individual metric. Defaulted to `True`. +- [Optional] `print_results`: a boolean which when set to `True`, prints the result of each evaluation. Defaulted to `True`. + Similar to `assert_test`, `evaluate` allows you to log and view test results on Confident AI. For more examples of `evaluate`, visit the [datasets section](evaluation-datasets). diff --git a/docs/docs/metrics-answer-relevancy.mdx b/docs/docs/metrics-answer-relevancy.mdx index 1ffbd78c7..b79c4634e 100644 --- a/docs/docs/metrics-answer-relevancy.mdx +++ b/docs/docs/metrics-answer-relevancy.mdx @@ -47,11 +47,13 @@ print(metric.reason) evaluate([test_case], [metric]) ``` -There are three optional parameters when creating an `AnswerRelevancyMetric`: +There are five optional parameters when creating an `AnswerRelevancyMetric`: - [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to 0.5. 
- [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4-0125-preview'. - [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`. +- [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 1 for perfection, 0 otherwise. It also overrides the current threshold and sets it to 1. Defaulted to `False`. +- [Optional] `async_mode`: a boolean which when set to `True`, enables concurrent execution of said metric. Defaulted to `True`. ## How Is It Calculated? diff --git a/docs/docs/metrics-bias.mdx b/docs/docs/metrics-bias.mdx index ea1fe01da..e18b8f97c 100644 --- a/docs/docs/metrics-bias.mdx +++ b/docs/docs/metrics-bias.mdx @@ -37,11 +37,13 @@ print(metric.score) print(metric.reason) ``` -There are three optional parameters when creating a `BiasMetric`: +There are five optional parameters when creating a `BiasMetric`: - [Optional] `threshold`: a float representing the maximum passing threshold, defaulted to 0.5. - [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4-0125-preview'. - [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`. +- [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 0 for perfection, 1 otherwise. It also overrides the current threshold and sets it to 0. Defaulted to `False`. +- [Optional] `async_mode`: a boolean which when set to `True`, enables concurrent execution of said metric. Defaulted to `True`. :::note Unlike other metrics you've seen so far, the `threshold` for the `BiasMetric` is instead a maxmium threshold. diff --git a/docs/docs/metrics-contextual-precision.mdx b/docs/docs/metrics-contextual-precision.mdx index 9d19447ec..adaab4e73 100644 --- a/docs/docs/metrics-contextual-precision.mdx +++ b/docs/docs/metrics-contextual-precision.mdx @@ -53,11 +53,13 @@ print(metric.reason) evaluate([test_case], [metric]) ``` -There are three optional parameters when creating a `ContextualPrecisionMetric`: +There are five optional parameters when creating a `ContextualPrecisionMetric`: - [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to 0.5. - [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4-0125-preview'. - [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`. +- [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 1 for perfection, 0 otherwise. It also overrides the current threshold and sets it to 1. Defaulted to `False`. +- [Optional] `async_mode`: a boolean which when set to `True`, enables concurrent execution of said metric. Defaulted to `True`. ## How Is It Calculated? 
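The hunks above all add the same two flags, `strict_mode` and `async_mode`, to the metric constructors. Below is a minimal sketch of how they combine, using only parameters and attributes named in the documentation being patched; the test case contents are illustrative, and each metric's own page lists the `LLMTestCase` fields it actually requires:

```python
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# Illustrative test case; only `input` and `actual_output` are mandatory
# on LLMTestCase itself.
test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    retrieval_context=[
        "All customers are eligible for a 30 day full refund at no extra cost."
    ],
)

# strict_mode=True forces a binary score (1 for perfection, 0 otherwise) and
# overrides the threshold; async_mode=True runs the metric's internal steps
# concurrently inside measure().
metric = AnswerRelevancyMetric(strict_mode=True, async_mode=True)
metric.measure(test_case)
print(metric.score, metric.reason)
```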
diff --git a/docs/docs/metrics-contextual-recall.mdx b/docs/docs/metrics-contextual-recall.mdx index cef2b9499..f4bb5efa4 100644 --- a/docs/docs/metrics-contextual-recall.mdx +++ b/docs/docs/metrics-contextual-recall.mdx @@ -53,11 +53,13 @@ print(metric.reason) evaluate([test_case], [metric]) ``` -There are three optional parameters when creating a `ContextualRecallMetric`: +There are five optional parameters when creating a `ContextualRecallMetric`: - [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to 0.5. - [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4-0125-preview'. - [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`. +- [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 1 for perfection, 0 otherwise. It also overrides the current threshold and sets it to 1. Defaulted to `False`. +- [Optional] `async_mode`: a boolean which when set to `True`, enables concurrent execution of said metric. Defaulted to `True`. ## How Is It Calculated? diff --git a/docs/docs/metrics-contextual-relevancy.mdx b/docs/docs/metrics-contextual-relevancy.mdx index 30dbd017f..2bfe7def8 100644 --- a/docs/docs/metrics-contextual-relevancy.mdx +++ b/docs/docs/metrics-contextual-relevancy.mdx @@ -52,12 +52,13 @@ print(metric.reason) evaluate([test_case], [metric]) ``` -There are four optional parameters when creating a `ContextualRelevancyMetricMetric`: +There are five optional parameters when creating a `ContextualRelevancyMetric`: - [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to 0.5. - [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4-0125-preview'. - [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score.
Defaulted to `True`. -- [Optional] `multithreading`: a boolean which when set to `True`, enables concurrent evaluation of said metric. Defaulted to `True`. +- [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 1 for perfection, 0 otherwise. It also overrides the current threshold and sets it to 1. Defaulted to `False`. +- [Optional] `async_mode`: a boolean which when set to `True`, enables concurrent execution of said metric. Defaulted to `True`. ## How Is It Calculated? diff --git a/docs/docs/metrics-hallucination.mdx b/docs/docs/metrics-hallucination.mdx index 033fdabd7..e6b852b69 100644 --- a/docs/docs/metrics-hallucination.mdx +++ b/docs/docs/metrics-hallucination.mdx @@ -52,12 +52,13 @@ print(metric.reason) evaluate([test_case], [metric]) ``` -There are four optional parameters when creating a `HallucinationMetric`: +There are five optional parameters when creating a `HallucinationMetric`: - [Optional] `threshold`: a float representing the maximum passing threshold, defaulted to 0.5. - [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4-0125-preview'. - [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`. -- [Optional] `multithreading`: a boolean which when set to `True`, enables concurrent evaluation of said metric. Defaulted to `True`. +- [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 0 for perfection, 1 otherwise. It also overrides the current threshold and sets it to 0. Defaulted to `False`. +- [Optional] `async_mode`: a boolean which when set to `True`, enables concurrent execution of said metric. Defaulted to `True`. ## How Is It Calculated? diff --git a/docs/docs/metrics-introduction.mdx b/docs/docs/metrics-introduction.mdx index 8ae47f1c8..4bb83e229 100644 --- a/docs/docs/metrics-introduction.mdx +++ b/docs/docs/metrics-introduction.mdx @@ -101,10 +101,15 @@ class AzureOpenAI(DeepEvalBaseLLM): def load_model(self): return self.model - def _call(self, prompt: str) -> str: + def generate(self, prompt: str) -> str: chat_model = self.load_model() return chat_model.invoke(prompt).content + async def a_generate(self, prompt: str) -> str: + chat_model = self.load_model() + res = await chat_model.ainvoke(prompt) + return res.content + def get_model_name(self): return "Custom Azure OpenAI Model" @@ -122,12 +127,19 @@ print(azure_openai("Write me a joke")) When creating a custom LLM evaluation model you should **ALWAYS**: - inherit `DeepEvalBaseLLM`. +- implement the `get_model_name()` method, which simply returns a string representing your custom model name. - implement the `load_model()` method, which will be responsible for returning a model object. -- implement the `_call()` method with **one and only one** parameter of type string that acts as the prompt to your custom LLM. -- the `_call()` method should return the final output string of your custom LLM. Note that we called `chat_model.invoke(prompt).content` to access the model output in this particular example, but this could be different depending on the implementation of your custom LLM object. -- the `get_model_name()` method simply returns a string representing the name of your LLM model. +- implement the `generate()` method with **one and only one** parameter of type string that acts as the prompt to your custom LLM. 
+- the `generate()` method should return the final output string of your custom LLM. Note that we called `chat_model.invoke(prompt).content` to access the model generations in this particular example, but this could be different depending on the implementation of your custom model object. +- implement the `a_generate()` method, with the same function signature as `generate()`. **Note that this is an async method**. In this example, we called `await chat_model.ainvoke(prompt)`, which is an asynchronous wrapper provided by LangChain's chat models. -Note that the `model` argument in the `__init__()` method can accept any type (the model string or object itself). Lastly, to use it for evaluation in LLM-based metrics: +:::tip +The `a_generate()` method is what `deepeval` uses to generate LLM outputs when you execute metrics / run evaluations asynchronously. + +If your custom model object does not have an asynchronous interface, simply reuse the same code from `generate()` (scroll down to the `Mistral7B` example for more details). However, this would make `a_generate()` a blocking process, regardless of whether you've turned on `async_mode` for a metric or not. +::: + +Lastly, to use it for evaluation for an LLM-Eval: ```python from deepeval.metrics import AnswerRelevancyMetric @@ -160,7 +172,7 @@ class Mistral7B(DeepEvalBaseLLM): def load_model(self): return self.model - def _call(self, prompt: str) -> str: + def generate(self, prompt: str) -> str: model = self.load_model() device = "cuda" # the device to load the model onto @@ -171,6 +183,9 @@ class Mistral7B(DeepEvalBaseLLM): generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True) return self.tokenizer.batch_decode(generated_ids)[0] + async def a_generate(self, prompt: str) -> str: + return self.generate(prompt) + def get_model_name(self): return "Mistral 7B" @@ -181,7 +196,33 @@ mistral_7b = Mistral7B(model=model, tokenizer=tokenizer) print(mistral_7b("Write me a joke")) ``` -Note that for this particular implementation, we initialized our `Mistral7B` model with an additional `tokenizer` parameter, as this is required in the decoding step of the `_call()` method, unlike the `AzureOpenAI` example above. Lastly, to use it for evaluation in LLM-based metrics: +Note that for this particular implementation, we initialized our `Mistral7B` model with an additional `tokenizer` parameter, as this is required in the decoding step of the `generate()` method. + +:::info +You'll notice we simply reused `generate()` in `a_generate()`, because unfortunately there's no asynchronous interface for Hugging Face's `transformers` library, and would make all metric executions a synchronous, blocking process. + +However, you can try offloading the generation process to a separate thread instead: + +```python +import asyncio + +class Mistral7B(DeepEvalBaseLLM): + # ... (existing code) ... + + async def a_generate(self, prompt: str) -> str: + loop = asyncio.get_running_loop() + return await loop.run_in_executor(None, self.generate, prompt) +``` + +Some additional considerations and reasons why you should be extra careful with this implementation: + +- Running the generation in a separate thread may not fully utilize GPU resources if the model is GPU-based. +- There could be potential performance implications of frequently switching between threads. +- You'd need to ensure thread safety if multiple async generations are happening concurrently and sharing resources. 
+ +::: + +Lastly, to use your custom `Mistral7B` model for evaluation: ```python from deepeval.metrics import AnswerRelevancyMetric @@ -191,7 +232,7 @@ metric = AnswerRelevancyMetric(model=mistral_7b) ``` :::tip -You need to specify the custom evaluation model you would like to use through the `model` argument when instantiating an LLM-based metric. +You need to specify the custom evaluation model you created via the `model` argument when creating a metric. ::: ### AWS Bedrock Example @@ -212,10 +253,15 @@ class AWSBedrock(DeepEvalBaseLLM): def load_model(self): return self.model - def _call(self, prompt: str) -> str: + def generate(self, prompt: str) -> str: chat_model = self.load_model() return chat_model.invoke(prompt).content + def a_generate(self, prompt: str) -> str: + chat_model = self.load_model() + res = await chat_model.ainvoke(prompt) + return res.content + def get_model_name(self): return "Custom Azure OpenAI Model" @@ -247,11 +293,16 @@ All metrics in `deepeval`, including [custom metrics that you create](metrics-cu - can be executed via the `metric.measure()` method - can have its score accessed via `metric.score` +- can have its score reason accessed via `metric.reason` - can have its status accessed via `metric.is_successful()` - can be used to evaluate test cases or entire datasets, with or without Pytest. -- has a `threshold` that acts as the threshold for success. `metric.is_successful()` is only true if `metric.score` >= `threshold`. +- has a `threshold` that acts as the threshold for success. `metric.is_successful()` is only true if `metric.score` is above/below `threshold`. -In additional, most LLM-Evals in `deepeval` offers a reason for its score, which can be accessed via `metric.reason`. +In additional, all metrics in `deepeval` executes asynchronously by default. This behavior is something you can configure via the `async_mode` parameter when instantiating a metric. + +:::tip +Visit an individual metric page for a metric's full details. +::: Here's a quick example. diff --git a/docs/docs/metrics-llm-evals.mdx b/docs/docs/metrics-llm-evals.mdx index 6cab455b2..8a656a72e 100644 --- a/docs/docs/metrics-llm-evals.mdx +++ b/docs/docs/metrics-llm-evals.mdx @@ -4,7 +4,7 @@ title: G-Eval sidebar_label: G-Eval --- -G-Eval is a custom, LLM evaluated metric. This means its score is calculated using an LLM. G-Eval is the most verstile type of metric `deepeval` has to offer, and is capable of evaluating almost any use cases. +G-Eval is a custom, LLM evaluated metric. This means its score is calculated using an LLM. G-Eval is the most verstile type of metric `deepeval` has to offer, and is capable of evaluating almost any use case with human-like accuracy. ## Required Arguments @@ -32,7 +32,7 @@ coherence_metric = GEval( ) ``` -There are three mandatory and two optional parameters required when instantiating an `GEval` class: +There are three mandatory and five optional parameters required when instantiating an `GEval` class: - `name`: name of metric - `criteria`: a description outlining the specific evaluation aspects for each test case. @@ -40,6 +40,8 @@ There are three mandatory and two optional parameters required when instantiatin - [Optional] `evaluation_steps`: a list of strings outlining the exact steps the LLM should take for evaluation. You can only provide either `evaluation_steps` **OR** `criteria`, and not both. - [Optional] `threshold`: the passing threshold, defaulted to 0.5. 
- [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4-0125-preview'. +- [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 1 for perfection, 0 otherwise. It also overrides the current threshold and sets it to 1. Defaulted to `False`. +- [Optional] `async_mode`: a boolean which when set to `True`, enables concurrent execution of said metric. Defaulted to `True`. :::danger For accurate and valid results, only the parameters that are mentioned in `criteria` should be included as a member of `evaluation_params`. diff --git a/docs/docs/metrics-summarization.mdx b/docs/docs/metrics-summarization.mdx index d06082f53..b0b281021 100644 --- a/docs/docs/metrics-summarization.mdx +++ b/docs/docs/metrics-summarization.mdx @@ -65,14 +65,15 @@ print(metric.reason) evaluate([test_case], [metric]) ``` -There are six optional parameters when instantiating an `SummarizationMetric` class: +There are seven optional parameters when instantiating an `SummarizationMetric` class: - [Optional] `threshold`: the passing threshold, defaulted to 0.5. -- [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4-0125-preview'. - [Optional] `assessment_questions`: a list of **close-ended questions that can be answered with either a 'yes' or a 'no'**. These are questions you want your summary to be able to ideally answer, and is especially helpful if you already know what a good summary for your use case looks like. If `assessment_questions` is not provided, we will generate a set of `assessment_questions` for you at evaluation time. The `assessment_questions` are used to calculate the `coverage_score`. - [Optional] `n`: the number of assessment questions to generate when `assessment_questions` is not provided. Defaulted to 5. +- [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4-0125-preview'. - [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`. -- [Optional] `multithreading`: a boolean which when set to `True`, enables concurrent evaluation of said metric. Defaulted to `True`. +- [Optional] `strict_mode`: a boolean which when set to True, enforces a strict evaluation criterion. In strict mode, the metric score becomes binary: a score of 1 indicates a perfect result, and any outcome less than perfect is scored as 0. Defaulted as `False`. +- [Optional] `async_mode`: a boolean which when set to `True`, enables concurrent execution of said metric. Defaulted to `True`. ## How Is It Calculated? diff --git a/docs/docs/metrics-toxicity.mdx b/docs/docs/metrics-toxicity.mdx index 8fc320f2b..bb5543ff6 100644 --- a/docs/docs/metrics-toxicity.mdx +++ b/docs/docs/metrics-toxicity.mdx @@ -42,6 +42,8 @@ There are three optional parameters when creating a `ToxicityMetric`: - [Optional] `threshold`: a float representing the maximum passing threshold, defaulted to 0.5. - [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4-0125-preview'. 
- [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`. +- [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 0 for perfection, 1 otherwise. It also overrides the current threshold and sets it to 0. Defaulted to `False`. +- [Optional] `async_mode`: a boolean which when set to `True`, enables concurrent execution of said metric. Defaulted to `True`. :::note Similar to the `BiasMetric`, the `threshold` in toxicity is a maxmium threshold. From 1df8aad1968867089d3ed4a4e3c6b4863bf6ff28 Mon Sep 17 00:00:00 2001 From: Jeffrey Ip Date: Sun, 10 Mar 2024 00:37:51 +0800 Subject: [PATCH 58/59] upated docs --- docs/docs/metrics-answer-relevancy.mdx | 2 +- docs/docs/metrics-bias.mdx | 2 +- docs/docs/metrics-contextual-precision.mdx | 2 +- docs/docs/metrics-contextual-recall.mdx | 2 +- docs/docs/metrics-contextual-relevancy.mdx | 2 +- docs/docs/metrics-faithfulness.mdx | 2 +- docs/docs/metrics-hallucination.mdx | 2 +- docs/docs/metrics-introduction.mdx | 55 ++++++++++++++++++++-- docs/docs/metrics-llm-evals.mdx | 2 +- docs/docs/metrics-summarization.mdx | 2 +- docs/docs/metrics-toxicity.mdx | 4 +- 11 files changed, 61 insertions(+), 16 deletions(-) diff --git a/docs/docs/metrics-answer-relevancy.mdx b/docs/docs/metrics-answer-relevancy.mdx index b79c4634e..868dfc756 100644 --- a/docs/docs/metrics-answer-relevancy.mdx +++ b/docs/docs/metrics-answer-relevancy.mdx @@ -53,7 +53,7 @@ There are five optional parameters when creating an `AnswerRelevancyMetric`: - [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4-0125-preview'. - [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`. - [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 1 for perfection, 0 otherwise. It also overrides the current threshold and sets it to 1. Defaulted to `False`. -- [Optional] `async_mode`: a boolean which when set to `True`, enables concurrent execution of said metric. Defaulted to `True`. +- [Optional] `async_mode`: a boolean which when set to `True`, enables [concurrent execution within the `measure()` method.](metrics-introduction#measuring-a-metric-in-async) Defaulted to `True`. ## How Is It Calculated? diff --git a/docs/docs/metrics-bias.mdx b/docs/docs/metrics-bias.mdx index e18b8f97c..90bb6c7a1 100644 --- a/docs/docs/metrics-bias.mdx +++ b/docs/docs/metrics-bias.mdx @@ -43,7 +43,7 @@ There are five optional parameters when creating a `BiasMetric`: - [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4-0125-preview'. - [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`. - [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 0 for perfection, 1 otherwise. It also overrides the current threshold and sets it to 0. Defaulted to `False`. -- [Optional] `async_mode`: a boolean which when set to `True`, enables concurrent execution of said metric. Defaulted to `True`. 
+- [Optional] `async_mode`: a boolean which when set to `True`, enables [concurrent execution within the `measure()` method.](metrics-introduction#measuring-a-metric-in-async) Defaulted to `True`. :::note Unlike other metrics you've seen so far, the `threshold` for the `BiasMetric` is instead a maxmium threshold. diff --git a/docs/docs/metrics-contextual-precision.mdx b/docs/docs/metrics-contextual-precision.mdx index adaab4e73..1d803b5c7 100644 --- a/docs/docs/metrics-contextual-precision.mdx +++ b/docs/docs/metrics-contextual-precision.mdx @@ -59,7 +59,7 @@ There are five optional parameters when creating a `ContextualPrecisionMetric`: - [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4-0125-preview'. - [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`. - [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 1 for perfection, 0 otherwise. It also overrides the current threshold and sets it to 1. Defaulted to `False`. -- [Optional] `async_mode`: a boolean which when set to `True`, enables concurrent execution of said metric. Defaulted to `True`. +- [Optional] `async_mode`: a boolean which when set to `True`, enables [concurrent execution within the `measure()` method.](metrics-introduction#measuring-a-metric-in-async) Defaulted to `True`. ## How Is It Calculated? diff --git a/docs/docs/metrics-contextual-recall.mdx b/docs/docs/metrics-contextual-recall.mdx index f4bb5efa4..ca70e0d8f 100644 --- a/docs/docs/metrics-contextual-recall.mdx +++ b/docs/docs/metrics-contextual-recall.mdx @@ -59,7 +59,7 @@ There are five optional parameters when creating a `ContextualRecallMetric`: - [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4-0125-preview'. - [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`. - [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 1 for perfection, 0 otherwise. It also overrides the current threshold and sets it to 1. Defaulted to `False`. -- [Optional] `async_mode`: a boolean which when set to `True`, enables concurrent execution of said metric. Defaulted to `True`. +- [Optional] `async_mode`: a boolean which when set to `True`, enables [concurrent execution within the `measure()` method.](metrics-introduction#measuring-a-metric-in-async) Defaulted to `True`. ## How Is It Calculated? diff --git a/docs/docs/metrics-contextual-relevancy.mdx b/docs/docs/metrics-contextual-relevancy.mdx index 2bfe7def8..09d75106f 100644 --- a/docs/docs/metrics-contextual-relevancy.mdx +++ b/docs/docs/metrics-contextual-relevancy.mdx @@ -58,7 +58,7 @@ There are five optional parameters when creating a `ContextualRelevancyMetricMet - [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4-0125-preview'. - [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`. 
- [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 1 for perfection, 0 otherwise. It also overrides the current threshold and sets it to 1. Defaulted to `False`. -- [Optional] `async_mode`: a boolean which when set to `True`, enables concurrent execution of said metric. Defaulted to `True`. +- [Optional] `async_mode`: a boolean which when set to `True`, enables [concurrent execution within the `measure()` method.](metrics-introduction#measuring-a-metric-in-async) Defaulted to `True`. ## How Is It Calculated? diff --git a/docs/docs/metrics-faithfulness.mdx b/docs/docs/metrics-faithfulness.mdx index a7e3ec206..5e8cd1e5d 100644 --- a/docs/docs/metrics-faithfulness.mdx +++ b/docs/docs/metrics-faithfulness.mdx @@ -58,7 +58,7 @@ There are five optional parameters when creating a `FaithfulnessMetric`: - [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4-0125-preview'. - [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`. - [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 1 for perfection, 0 otherwise. It also overrides the current threshold and sets it to 1. Defaulted to `False`. -- [Optional] `async_mode`: a boolean which when set to `True`, enables concurrent execution of said metric. Defaulted to `True`. +- [Optional] `async_mode`: a boolean which when set to `True`, enables [concurrent execution within the `measure()` method.](metrics-introduction#measuring-a-metric-in-async) Defaulted to `True`. ## How Is It Calculated? diff --git a/docs/docs/metrics-hallucination.mdx b/docs/docs/metrics-hallucination.mdx index e6b852b69..d53e44366 100644 --- a/docs/docs/metrics-hallucination.mdx +++ b/docs/docs/metrics-hallucination.mdx @@ -58,7 +58,7 @@ There are five optional parameters when creating a `HallucinationMetric`: - [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4-0125-preview'. - [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`. - [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 0 for perfection, 1 otherwise. It also overrides the current threshold and sets it to 0. Defaulted to `False`. -- [Optional] `async_mode`: a boolean which when set to `True`, enables concurrent execution of said metric. Defaulted to `True`. +- [Optional] `async_mode`: a boolean which when set to `True`, enables [concurrent execution within the `measure()` method.](metrics-introduction#measuring-a-metric-in-async) Defaulted to `True`. ## How Is It Calculated? 
diff --git a/docs/docs/metrics-introduction.mdx b/docs/docs/metrics-introduction.mdx index 4bb83e229..2a4656ac6 100644 --- a/docs/docs/metrics-introduction.mdx +++ b/docs/docs/metrics-introduction.mdx @@ -292,16 +292,17 @@ metric = AnswerRelevancyMetric(model=aws_bedrock) All metrics in `deepeval`, including [custom metrics that you create](metrics-custom): - can be executed via the `metric.measure()` method -- can have its score accessed via `metric.score` +- can have its score accessed via `metric.score`, which ranges from 0 - 1 - can have its score reason accessed via `metric.reason` - can have its status accessed via `metric.is_successful()` - can be used to evaluate test cases or entire datasets, with or without Pytest. - has a `threshold` that acts as the threshold for success. `metric.is_successful()` is only true if `metric.score` is above/below `threshold`. +- has a `strict_mode` property, which enforces `metric.score` to a binary one In additional, all metrics in `deepeval` executes asynchronously by default. This behavior is something you can configure via the `async_mode` parameter when instantiating a metric. :::tip -Visit an individual metric page for a metric's full details. +Visit an individual metric page to learn how they are calculated, and what is required when creating an `LLMTestCase` in order to execute it. ::: Here's a quick example. @@ -335,7 +336,7 @@ print(metric.score) print(metric.reason) ``` -Or you can either evaluate a test case using `deepeval test run`: +Or you can either assert a test case using [`assert_test()` via `deepeval test run`](evaluation-test-cases#assert-a-test-case): ```python title="test_file.py" from deepeval import assert_test @@ -349,7 +350,7 @@ def test_answer_relevancy(): deepeval test run test_file.py ``` -Or using the `evaluate` function: +Or using the [`evaluate` function:](evaluation-test-cases#evaluate-test-cases-in-bulk) ```python from deepeval import evaluate @@ -358,4 +359,48 @@ from deepeval import evaluate evaluate([test_case], [metric]) ``` -For more details on how a metric evaluates a test case, refer to the [test cases section.](evaluation-test-cases#assert-a-test-case) +## Measuring Metrics in Async + +When a metric's `async_mode=True` (which is the default value for all metrics), invocations of `metric.measure()` will execute its internal algorithms concurrently. However, it's important to note that while operations **INSIDE** `measure()` executes concurrently, the `metric.measure()` call itself still blocks the main thread. + +:::info +Let's take the [`FaithfulnessMetric` algorithm](metrics-faithfulness#how-is-it-calculated) for example: + +1. **Extract all factual claims** made in the `actual_output` +2. **Extract all factual truths** found in the `retrieval_context` +3. **Compare extracted claims and truths** to generate a final score and reason. + +```python +from deepeval.metrics import FaithfulnessMetric +... + +metric = FaithfulnessMetric(async_mode=True) +metric.measure(test_case) +print("Metric finished!") +``` + +When `async_mode=True`, steps 1 and 2 executes concurrently (ie. at the same time) since they are independent of each other, while `async_mode=False` will cause steps 1 and 2 to execute sequentially instead (ie. one after the other). + +In both cases, "Metric finished!" will wait for `metric.measure()` to finish running before printing, but setting `async_mode` to `True` would make the print statement appear earlier, as `async_mode=True` allows `metric.measure()` to run faster. 
+
+:::
+
+To measure multiple metrics at once and **NOT** block the main thread, use the asynchronous `a_measure()` method instead.
+
+```python
+import asyncio
+...
+
+# Remember to use async
+async def long_running_function():
+    # These will all run at the same time
+    await asyncio.gather(
+        metric1.a_measure(test_case),
+        metric2.a_measure(test_case),
+        metric3.a_measure(test_case),
+        metric4.a_measure(test_case)
+    )
+    print("Metrics finished!")
+
+asyncio.run(long_running_function())
+```
diff --git a/docs/docs/metrics-llm-evals.mdx b/docs/docs/metrics-llm-evals.mdx
index 8a656a72e..b3d461096 100644
--- a/docs/docs/metrics-llm-evals.mdx
+++ b/docs/docs/metrics-llm-evals.mdx
@@ -41,7 +41,7 @@ There are three mandatory and five optional parameters required when instantiati
 - [Optional] `threshold`: the passing threshold, defaulted to 0.5.
 - [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4-0125-preview'.
 - [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 1 for perfection, 0 otherwise. It also overrides the current threshold and sets it to 1. Defaulted to `False`.
-- [Optional] `async_mode`: a boolean which when set to `True`, enables concurrent execution of said metric. Defaulted to `True`.
+- [Optional] `async_mode`: a boolean which when set to `True`, enables [concurrent execution within the `measure()` method](metrics-introduction#measuring-a-metric-in-async). Defaulted to `True`.
 
 :::danger
 For accurate and valid results, only the parameters that are mentioned in `criteria` should be included as a member of `evaluation_params`.
 :::
diff --git a/docs/docs/metrics-summarization.mdx b/docs/docs/metrics-summarization.mdx
index b0b281021..07b5734c1 100644
--- a/docs/docs/metrics-summarization.mdx
+++ b/docs/docs/metrics-summarization.mdx
@@ -73,7 +73,7 @@ There are seven optional parameters when instantiating an `SummarizationMetric`
 - [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4-0125-preview'.
 - [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`.
 - [Optional] `strict_mode`: a boolean which when set to `True`, enforces a strict evaluation criterion. In strict mode, the metric score becomes binary: a score of 1 indicates a perfect result, and any outcome less than perfect is scored as 0. Defaulted to `False`.
-- [Optional] `async_mode`: a boolean which when set to `True`, enables concurrent execution of said metric. Defaulted to `True`.
+- [Optional] `async_mode`: a boolean which when set to `True`, enables [concurrent execution within the `measure()` method](metrics-introduction#measuring-a-metric-in-async). Defaulted to `True`.
 
 ## How Is It Calculated?
 
diff --git a/docs/docs/metrics-toxicity.mdx b/docs/docs/metrics-toxicity.mdx
index bb5543ff6..5c1c10f77 100644
--- a/docs/docs/metrics-toxicity.mdx
+++ b/docs/docs/metrics-toxicity.mdx
@@ -37,13 +37,13 @@ print(metric.score)
 print(metric.reason)
 ```
 
-There are three optional parameters when creating a `ToxicityMetric`:
+There are five optional parameters when creating a `ToxicityMetric`:
 
 - [Optional] `threshold`: a float representing the maximum passing threshold, defaulted to 0.5.
 - [Optional] `model`: a string specifying which of OpenAI's GPT models to use, **OR** [any custom LLM model](metrics-introduction#using-a-custom-llm) of type `DeepEvalBaseLLM`. Defaulted to 'gpt-4-0125-preview'.
 - [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`.
 - [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 0 for perfection, 1 otherwise. It also overrides the current threshold and sets it to 0. Defaulted to `False`.
-- [Optional] `async_mode`: a boolean which when set to `True`, enables concurrent execution of said metric. Defaulted to `True`.
+- [Optional] `async_mode`: a boolean which when set to `True`, enables [concurrent execution within the `measure()` method](metrics-introduction#measuring-a-metric-in-async). Defaulted to `True`.
 
 :::note
 Similar to the `BiasMetric`, the `threshold` in toxicity is a maximum threshold.

From 215a1baad946ff214806a1255d5024611595cc87 Mon Sep 17 00:00:00 2001
From: Jeffrey Ip
Date: Sun, 10 Mar 2024 01:32:59 +0800
Subject: [PATCH 59/59] updated docs

---
 docs/docs/evaluation-introduction.mdx | 4 ++--
 docs/docs/evaluation-test-cases.mdx   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/docs/evaluation-introduction.mdx b/docs/docs/evaluation-introduction.mdx
index 22a9a216c..73a68504a 100644
--- a/docs/docs/evaluation-introduction.mdx
+++ b/docs/docs/evaluation-introduction.mdx
@@ -129,7 +129,7 @@ deepeval test run test_example.py
 
 There are two mandatory and one optional parameter when calling the `assert_test()` function:
 
-- `test_case`: a `LLMTestCase`
+- `test_case`: an `LLMTestCase`
 - `metrics`: a list of metrics of type `BaseMetric`
 - [Optional] `run_async`: a boolean which when set to `True`, enables concurrent evaluation of all metrics. Defaulted to `True`.
@@ -174,7 +174,7 @@ evaluate(dataset, [answer_relevancy_metric])
 
 There are two mandatory and three optional arguments when calling the `evaluate()` function:
 
-- `test_case`: a `LLMTestCase`
+- `test_case`: an `LLMTestCase`
 - `metrics`: a list of metrics of type `BaseMetric`
 - [Optional] `run_async`: a boolean which when set to `True`, enables concurrent evaluation of all metrics. Defaulted to `True`.
 - [Optional] `show_indicator`: a boolean which when set to `True`, shows the progress indicator for each individual metric. Defaulted to `True`.
diff --git a/docs/docs/evaluation-test-cases.mdx b/docs/docs/evaluation-test-cases.mdx
index 3a93e0ac5..04f8a7349 100644
--- a/docs/docs/evaluation-test-cases.mdx
+++ b/docs/docs/evaluation-test-cases.mdx
@@ -266,7 +266,7 @@ def test_case_2():
 
 There are two mandatory and one optional parameter when calling the `assert_test()` function:
 
-- `test_case`: a `LLMTestCase`
+- `test_case`: an `LLMTestCase`
 - `metrics`: a list of metrics of type `BaseMetric`
 - [Optional] `run_async`: a boolean which when set to `True`, enables concurrent evaluation of all metrics. Defaulted to `True`.
@@ -324,7 +324,7 @@ evaluate(test_cases, [metric])
 
 There are two mandatory and three optional arguments when calling the `evaluate()` function:
 
-- `test_case`: a `LLMTestCase`
+- `test_case`: an `LLMTestCase`
 - `metrics`: a list of metrics of type `BaseMetric`
 - [Optional] `run_async`: a boolean which when set to `True`, enables concurrent evaluation of all metrics. Defaulted to `True`.
 - [Optional] `show_indicator`: a boolean which when set to `True`, shows the progress indicator for each individual metric. Defaulted to `True`.
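To tie the `assert_test()` and `evaluate()` parameters above together, here is a small illustrative sketch. The test case contents are hypothetical, and the keyword arguments mirror the optional parameters listed in these docs, shown at their stated defaults; they are not verified against any particular library version.

```python
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# Hypothetical test cases, purely for illustration
test_cases = [
    LLMTestCase(
        input="What are your operating hours?",
        actual_output="We are open from 9am to 5pm on weekdays.",
    ),
    LLMTestCase(
        input="Do you offer refunds?",
        actual_output="Yes, refunds are available within 30 days of purchase.",
    ),
]

metric = AnswerRelevancyMetric(threshold=0.5)

# The two mandatory arguments plus the optional ones documented above,
# shown at their stated defaults
evaluate(
    test_cases,
    [metric],
    run_async=True,       # evaluate all metrics concurrently
    show_indicator=True,  # show a progress indicator for each metric
)
```

The same test cases could instead be asserted one at a time inside `test_`-prefixed functions with `assert_test(test_case, [metric])` and executed via `deepeval test run`, as described in the hunks above.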