diff --git a/deepeval/dataset/utils.py b/deepeval/dataset/utils.py
index 9ac5f8409..810c1d992 100644
--- a/deepeval/dataset/utils.py
+++ b/deepeval/dataset/utils.py
@@ -1,6 +1,6 @@
 from typing import List, Optional, Any
 from deepeval.dataset.api import Golden, ConversationalGolden
-from deepeval.test_case import LLMTestCase, ConversationalTestCase
+from deepeval.test_case import LLMTestCase, ConversationalTestCase, MLLMTestCase
 import json
 import re

@@ -27,10 +27,10 @@ def convert_goldens_to_test_cases(
     goldens: List[Golden],
     _alias: Optional[str] = None,
     _id: Optional[str] = None,
-) -> List[LLMTestCase]:
+) -> List[LLMTestCase | MLLMTestCase]:
     test_cases = []
     for index, golden in enumerate(goldens):
         test_case = LLMTestCase(
             input=golden.input,
             actual_output=golden.actual_output,
             expected_output=golden.expected_output,
diff --git a/deepeval/evaluate.py b/deepeval/evaluate.py
index 4e82eb0fe..4f0fc5975 100644
--- a/deepeval/evaluate.py
+++ b/deepeval/evaluate.py
@@ -234,6 +234,8 @@ def create_api_test_case(
             name=name,
             multimodalInput=test_case.input,
             multimodalActualOutput=test_case.actual_output,
+            toolsCalled=test_case.tools_called,
+            expectedTools=test_case.expected_tools,
             success=success,
             metricsData=metrics_data,
             runDuration=None,
diff --git a/deepeval/metrics/__init__.py b/deepeval/metrics/__init__.py
index 1e43d5773..4dc780b00 100644
@@ -41,4 +41,5 @@
     MultimodalContextualPrecisionMetric,
     MultimodalAnswerRelevancyMetric,
     MultimodalFaithfulnessMetric,
+    MultimodalToolCorrectnessMetric,
 )
diff --git a/deepeval/metrics/multimodal_metrics/__init__.py b/deepeval/metrics/multimodal_metrics/__init__.py
index dd49829f6..aa4e27775 100644
@@ -18,3 +18,7 @@
 from .multimodal_faithfulness.multimodal_faithfulness import (
     MultimodalFaithfulnessMetric,
 )
+
+from .multimodal_tool_correctness.multimodal_tool_correctness import (
+    MultimodalToolCorrectnessMetric,
+)
diff --git a/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py b/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py b/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py
new file mode 100644
index 000000000..121fae9f4
--- /dev/null
+++ b/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py
@@ -0,0 +1,275 @@
+from typing import List, Dict
+
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.utils import (
+    construct_verbose_logs,
+    check_llm_test_case_params,
+)
+from deepeval.test_case import (
+    MLLMTestCase,
+    MLLMTestCaseParams,
+    ToolCallParams,
+    ToolCall,
+)
+from deepeval.metrics import BaseMetric
+
+# Simplified required params for MLLM only
+required_params: List[MLLMTestCaseParams] = [
+    MLLMTestCaseParams.INPUT,
+    MLLMTestCaseParams.ACTUAL_OUTPUT,
+    MLLMTestCaseParams.TOOLS_CALLED,
+    MLLMTestCaseParams.EXPECTED_TOOLS,
+]
+
+
+class MultimodalToolCorrectnessMetric(BaseMetric):
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        evaluation_params: List[ToolCallParams] = [],
+        include_reason: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+        should_exact_match: bool = False,
+        should_consider_ordering: bool = False,
+    ):
+        self.threshold = 1 if strict_mode else threshold
+        self.include_reason = include_reason
+        self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
+        self.evaluation_params: List[ToolCallParams] = evaluation_params
+        self.should_exact_match = should_exact_match
+        self.should_consider_ordering = should_consider_ordering
+
+    def measure(
+        self,
+        test_case: MLLMTestCase,
+        _show_indicator: bool = True,
+    ) -> float:
+        check_llm_test_case_params(test_case, required_params, self)
+        self.test_case = test_case
+        with metric_progress_indicator(self, _show_indicator=_show_indicator):
+            self.tools_called: List[ToolCall] = test_case.tools_called
+            self.expected_tools: List[ToolCall] = test_case.expected_tools
+            self.score = self._calculate_score()
+            self.reason = self._generate_reason()
+            self.success = self.score >= self.threshold
+            expected_tools_formatted = (
+                "Expected Tools:\n[\n"
+                + ",\n".join(
+                    self.indent_multiline_string(
+                        repr(tool_call), indent_level=4
+                    )
+                    for tool_call in self.expected_tools
+                )
+                + "\n]"
+            )
+            tools_called_formatted = (
+                "Tools Called:\n[\n"
+                + ",\n".join(
+                    self.indent_multiline_string(
+                        repr(tool_call), indent_level=4
+                    )
+                    for tool_call in self.tools_called
+                )
+                + "\n]"
+            )
+            steps = [
+                f"{expected_tools_formatted}",
+                f"{tools_called_formatted}",
+            ]
+            steps.append(f"Score: {self.score}\nReason: {self.reason}")
+            self.verbose_logs = construct_verbose_logs(self, steps=steps)
+            return self.score
+
+    async def a_measure(
+        self, test_case: MLLMTestCase, _show_indicator: bool = True
+    ) -> float:
+        return self.measure(test_case, _show_indicator=_show_indicator)
+
+    ##################################################
+    ### Tool Correctness (Tool) ######################
+    ##################################################
+
+    def _generate_reason(self):
+        tools_called_names = [
+            tool_called.name for tool_called in self.tools_called
+        ]
+        expected_tools_names = [
+            expected_tool.name for expected_tool in self.expected_tools
+        ]
+
+        if self.should_exact_match:
+            return f"{'Exact match' if self._calculate_exact_match_score() else 'Not an exact match'}: expected {expected_tools_names}, called {tools_called_names}. See details above."
+
+        elif self.should_consider_ordering:
+            lcs, weighted_length = self._compute_weighted_lcs()
+            score = weighted_length / len(expected_tools_names)
+            missing = set(expected_tools_names) - set(tools_called_names)
+            out_of_order = set(expected_tools_names) - set(
+                [tool.name for tool in lcs]
+            )
+            if score == 1:
+                return f"Correct ordering: all expected tools {expected_tools_names} were called in the correct order."
+            else:
+                issues = []
+                if missing:
+                    issues.append(f"missing tools {list(missing)}")
+                if out_of_order:
+                    issues.append(f"out-of-order tools {list(out_of_order)}")
+                return f"Incorrect tool usage: {' and '.join(issues)}; expected {expected_tools_names}, called {tools_called_names}. See more details above."
+        else:
+            used_expected = set(self.tools_called).intersection(
+                set(self.expected_tools)
+            )
+            missing = set(self.expected_tools) - used_expected
+            if self._calculate_non_exact_match_score() == 1:
+                return f"All expected tools {expected_tools_names} were called (order not considered)."
+            else:
+                return f"Incomplete tool usage: missing tools {list(missing)}; expected {expected_tools_names}, called {tools_called_names}. See more details above."
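+
+    # Note: the scoring helpers below implement three modes, from strictest to
+    # most lenient:
+    #   1. should_exact_match: tools_called must match expected_tools element-wise
+    #      (names, plus input parameters/outputs when included in evaluation_params).
+    #   2. should_consider_ordering: a weighted longest-common-subsequence over tool
+    #      names rewards calls that appear in the expected order.
+    #   3. default: each expected tool is greedily matched to at most one called
+    #      tool, ignoring order; the score is the matched fraction of expected tools.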
+ + ################################################## + ### Score Helper Functions ####################### + ################################################## + + # Calculate score + def _calculate_score(self): + if self.should_exact_match: + score = self._calculate_exact_match_score() + elif self.should_consider_ordering: + _, weighted_length = self._compute_weighted_lcs() + score = weighted_length / len(self.expected_tools) + else: + score = self._calculate_non_exact_match_score() + return 0 if self.strict_mode and score < self.threshold else score + + # Exact matching score + def _calculate_exact_match_score(self): + if len(self.tools_called) != len(self.expected_tools): + return 0.0 + for i in range(len(self.tools_called)): + if self.tools_called[i].name != self.expected_tools[i].name: + return 0.0 + if ToolCallParams.INPUT_PARAMETERS in self.evaluation_params: + if ( + self.tools_called[i].input_parameters + != self.expected_tools[i].input_parameters + ): + return 0.0 + if ToolCallParams.OUTPUT in self.evaluation_params: + if self.tools_called[i].output != self.expected_tools[i].output: + return 0.0 + return 1.0 + + # Non exact matching score + def _calculate_non_exact_match_score(self): + total_score = 0.0 + matched_called_tools = set() + for expected_tool in self.expected_tools: + best_score = 0.0 + for called_tool in self.tools_called: + if called_tool in matched_called_tools: + continue + if expected_tool.name == called_tool.name: + match_score = 1.0 + if ( + ToolCallParams.INPUT_PARAMETERS + in self.evaluation_params + ): + match_score *= self._compare_dicts( + expected_tool.input_parameters, + called_tool.input_parameters, + ) + if ( + ToolCallParams.OUTPUT in self.evaluation_params + and expected_tool.output != called_tool.output + ): + match_score = 0.0 + if match_score > best_score: + best_score = match_score + best_called_tool = called_tool + if best_score > 0: + total_score += best_score + matched_called_tools.add(best_called_tool) + return ( + total_score / len(self.expected_tools) + if self.expected_tools + else 0.0 + ) + + # Consider ordering score + def _compute_weighted_lcs(self): + m, n = len(self.expected_tools), len(self.tools_called) + dp = [[0.0] * (n + 1) for _ in range(m + 1)] + for i in range(1, m + 1): + for j in range(1, n + 1): + expected_tool, called_tool = ( + self.expected_tools[i - 1], + self.tools_called[j - 1], + ) + if expected_tool.name != called_tool.name: + dp[i][j] = max(dp[i - 1][j], dp[i][j - 1]) + continue + score = 1.0 + if ToolCallParams.INPUT_PARAMETERS in self.evaluation_params: + score *= self._compare_dicts( + expected_tool.input_parameters, + called_tool.input_parameters, + ) + if ( + ToolCallParams.OUTPUT in self.evaluation_params + and expected_tool.output != called_tool.output + ): + score = 0.0 + dp[i][j] = max( + dp[i - 1][j], + dp[i][j - 1], + dp[i - 1][j - 1] + score if score > 0 else 0, + ) + i, j, total_score = m, n, 0.0 + lcs = [] + while i > 0 and j > 0: + if dp[i][j] == dp[i - 1][j]: + i -= 1 + elif dp[i][j] == dp[i][j - 1]: + j -= 1 + else: + lcs.append(self.expected_tools[i - 1]) + total_score += dp[i][j] - dp[i - 1][j - 1] + i, j = i - 1, j - 1 + return lcs[::-1], total_score + + # For matching input parameters + def _compare_dicts(self, dict1: Dict, dict2: Dict): + if self.should_exact_match: + return 1.0 if dict1 == dict2 else 0.0 + match_score = 0 + matched_keys = set(dict1.keys()).intersection(set(dict2.keys())) + total_keys = set(dict1.keys()).union(set(dict2.keys())) + for key in matched_keys: + if dict1[key] == 
dict2[key]: + match_score += 1 / len(total_keys) + elif isinstance(dict1[key], dict) and isinstance(dict2[key], dict): + match_score += self._compare_dicts( + dict1[key], dict2[key] + ) / len(total_keys) + return match_score + + ################################################## + ### Others ####################################### + ################################################## + + def is_successful(self) -> bool: + try: + self.success = self.score >= self.threshold + except: + self.success = False + return self.success + + @property + def __name__(self): + return "Tool Correctness" + + def indent_multiline_string(self, s, indent_level=4): + indent = " " * indent_level + return "\n".join(f"{indent}{line}" for line in s.splitlines()) diff --git a/deepeval/metrics/tool_correctness/tool_correctness.py b/deepeval/metrics/tool_correctness/tool_correctness.py index a2583ff59..501e4bab3 100644 --- a/deepeval/metrics/tool_correctness/tool_correctness.py +++ b/deepeval/metrics/tool_correctness/tool_correctness.py @@ -274,4 +274,4 @@ def __name__(self): def indent_multiline_string(self, s, indent_level=4): indent = " " * indent_level - return "\n".join(f"{indent}{line}" for line in s.splitlines()) + return "\n".join(f"{indent}{line}" for line in s.splitlines()) \ No newline at end of file diff --git a/deepeval/test_case/mllm_test_case.py b/deepeval/test_case/mllm_test_case.py index 474d08f30..73ceb8cd0 100644 --- a/deepeval/test_case/mllm_test_case.py +++ b/deepeval/test_case/mllm_test_case.py @@ -1,8 +1,10 @@ import os from urllib.parse import urlparse from dataclasses import dataclass, field -from typing import List, Optional, Dict, Union +from typing import List, Optional, Dict, Union, Any from enum import Enum +import json +from pydantic import BaseModel, Field @dataclass @@ -26,6 +28,78 @@ def is_local_path(url): return False +class ToolCall(BaseModel): + name: str + description: Optional[str] = None + reasoning: Optional[str] = None + output: Optional[Any] = None + input_parameters: Optional[Dict[str, Any]] = Field( + None, serialization_alias="inputParameters" + ) + + def __eq__(self, other): + if not isinstance(other, ToolCall): + return False + return ( + self.name == other.name + and self.input_parameters == other.input_parameters + and self.output == other.output + ) + + def __hash__(self): + input_params = ( + self.input_parameters if self.input_parameters is not None else {} + ) + output_hashable = ( + frozenset(self.output.items()) + if isinstance(self.output, dict) + else self.output + ) + return hash( + (self.name, frozenset(input_params.items()), output_hashable) + ) + + def __repr__(self): + fields = [] + + # Add basic fields + if self.name: + fields.append(f'name="{self.name}"') + if self.description: + fields.append(f'description="{self.description}"') + if self.reasoning: + fields.append(f'reasoning="{self.reasoning}"') + + # Handle nested fields like input_parameters + if self.input_parameters: + formatted_input = json.dumps(self.input_parameters, indent=4) + formatted_input = self._indent_nested_field( + "input_parameters", formatted_input + ) + fields.append(formatted_input) + + # Handle nested fields like output + if isinstance(self.output, dict): + formatted_output = json.dumps(self.output, indent=4) + formatted_output = self._indent_nested_field( + "output", formatted_output + ) + fields.append(formatted_output) + elif self.output is not None: + fields.append(f"output={repr(self.output)}") + + # Combine fields with proper formatting + fields_str = ",\n ".join(fields) 
+ return f"ToolCall(\n {fields_str}\n)" + + @staticmethod + def _indent_nested_field(field_name: str, formatted_field: str) -> str: + """Helper method to indent multi-line fields for better readability.""" + lines = formatted_field.splitlines() + return f"{field_name}={lines[0]}\n" + "\n".join( + f" {line}" for line in lines[1:] + ) + class MLLMTestCaseParams(Enum): INPUT = "input" @@ -33,6 +107,8 @@ class MLLMTestCaseParams(Enum): EXPECTED_OUTPUT = "expected_output" CONTEXT = "context" RETRIEVAL_CONTEXT = "retrieval_context" + TOOLS_CALLED = "tools_called" + EXPECTED_TOOLS = "expected_tools" @dataclass @@ -44,6 +120,8 @@ class MLLMTestCase: retrieval_context: Optional[List[Union[str, MLLMImage]]] = None additional_metadata: Optional[Dict] = None comments: Optional[str] = None + tools_called: Optional[List[ToolCall]] = None + expected_tools: Optional[List[ToolCall]] = None name: Optional[str] = field(default=None) _dataset_rank: Optional[int] = field(default=None, repr=False) _dataset_alias: Optional[str] = field(default=None, repr=False) @@ -78,3 +156,21 @@ def __post_init__(self): raise TypeError( "'retrieval_context' must be None or a list of strings or MLLMImage instances" ) + + # Ensure `tools_called` is None or a list of strings + if self.tools_called is not None: + if not isinstance(self.tools_called, list) or not all( + isinstance(item, ToolCall) for item in self.tools_called + ): + raise TypeError( + "'tools_called' must be None or a list of `ToolCall`" + ) + + # Ensure `expected_tools` is None or a list of strings + if self.expected_tools is not None: + if not isinstance(self.expected_tools, list) or not all( + isinstance(item, ToolCall) for item in self.expected_tools + ): + raise TypeError( + "'expected_tools' must be None or a list of `ToolCall`" + ) diff --git a/docs/docs/multimodal-metrics-tool-correctness.mdx b/docs/docs/multimodal-metrics-tool-correctness.mdx new file mode 100644 index 000000000..a941cbab4 --- /dev/null +++ b/docs/docs/multimodal-metrics-tool-correctness.mdx @@ -0,0 +1,75 @@ +--- +id: multimodal-metrics-tool-correctness +title: Multimodal Tool Correctness +sidebar_label: Tool Correctness +--- + +import Equation from "@site/src/components/equation"; + +The multimodal tool correctness metric is an agentic LLM metric that assesses your multimodal LLM agent's function/tool calling ability. It is calculated by comparing whether every tool that is expected to be used was indeed called. + +:::info +The `MultimodalToolCorrectnessMetric` allows you to define the **strictness** of correctness. By default, it considers matching tool names to be correct, but you can also require input parameters and output to match. 
+::: + +## Required Arguments + +To use the `MultimodalToolCorrectnessMetric`, you'll have to provide the following arguments when creating an [`MLLMTestCase`](/docs/evaluation-test-cases#mllm-test-case): + +- `input` +- `actual_output` +- `tools_called` +- `expected_tools` + +## Example + +```python +from deepeval.metrics import MultimodalToolCorrectnessMetric +from deepeval.test_case import MLLMTestCase, ToolCall + +test_case = MLLMTestCase( + input="What's in this image?", + actual_output="The image shows a pair of running shoes.", + # Replace this with the tools that was actually used by your LLM agent + tools_called=[ToolCall(name="ImageAnalysis"), ToolCall(name="ToolQuery")], + expected_tools=[ToolCall(name="ImageAnalysis")], +) + +metric = MultimodalToolCorrectnessMetric() +metric.measure(test_case) +print(metric.score) +print(metric.reason) +``` + +There are seven optional parameters when creating a `MultimodalToolCorrectnessMetric`: + +- [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to 0.5. +- [Optional] `evaluation_params`: A list of `ToolCallParams` indicating the strictness of the correctness criteria, available options are `ToolCallParams.INPUT_PARAMETERS` and `ToolCallParams.OUTPUT`. For example, supplying a list containing `ToolCallParams.INPUT_PARAMETERS` but excluding `ToolCallParams.OUTPUT`, will deem a tool correct if the tool name and input parameters match, even if the output does not. Defaults to an empty list. +- [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`. +- [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 1 for perfection, 0 otherwise. It also overrides the current threshold and sets it to 1. Defaulted to `False`. +- [Optional] `verbose_mode`: a boolean which when set to `True`, prints the intermediate steps used to calculate said metric to the console, as outlined in the [How Is It Calculated](#how-is-it-calculated) section. Defaulted to `False`. +- [Optional] `should_consider_ordering`: a boolean which when set to `True`, will consider the ordering in which the tools were called in. For example, if `expected_tools=[ToolCall(name="ImageAnalysis"), ToolCall(name="ToolQuery"), ToolCall(name="ImageAnalysis")]` and `tools_called=[ToolCall(name="ImageAnalysis"), ToolCall(name="ImageAnalysis"), ToolCall(name="ToolQuery")]`, the metric will consider the tool calling to be incorrect. Only available for `ToolCallParams.TOOL` and defaulted to `False`. +- [Optional] `should_exact_match`: a boolean which when set to `True`, will require the `tools_called` and `expected_tools` to be exactly the same. Available for `ToolCallParams.TOOL` and `ToolCallParams.INPUT_PARAMETERS` and defaulted to `False`. + +:::info +Since `should_exact_match` is a stricter criteria than `should_consider_ordering`, setting `should_consider_ordering` will have no effect when `should_exact_match` is set to `True`. +::: + +## How Is It Calculated? + +:::note +The `MultimodalToolCorrectnessMetric`, unlike all other `deepeval` metrics, is not calculated using any models or LLMs, and instead via exact matching between the `expected_tools` and `tools_called` parameters. +::: + +The **multimodal tool correctness metric** score is calculated according to the following equation: + + + +This metric assesses the accuracy of your agent's tool usage by comparing the `tools_called` by your multimodal LLM agent to the list of `expected_tools`. 
A score of 1 indicates that every tool utilized by your LLM agent was called correctly according to the list of `expected_tools`, `should_consider_ordering`, and `should_exact_match`, while a score of 0 signifies that none of the `tools_called` were called correctly. + +:::info +If `exact_match` is not specified and `ToolCall.INPUT_PARAMETERS` is included in `evaluation_params`, correctness may be a percentage score based on the proportion of correct input parameters (assuming the name and output are correct, if applicable). +:::
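+
+To make the partial-credit behavior concrete, below is a minimal, illustrative sketch; the tool name and input parameters are hypothetical and not part of any real toolset:
+
+```python
+from deepeval.metrics import MultimodalToolCorrectnessMetric
+from deepeval.test_case import MLLMTestCase, ToolCall, ToolCallParams
+
+test_case = MLLMTestCase(
+    input="Describe this product photo.",
+    actual_output="A red running shoe on a white background.",
+    # Hypothetical tool call: the name matches the expected tool, but only
+    # one of the two input parameters agrees with what was expected.
+    tools_called=[
+        ToolCall(
+            name="ImageAnalysis",
+            input_parameters={"detail": "high", "crop": False},
+        )
+    ],
+    expected_tools=[
+        ToolCall(
+            name="ImageAnalysis",
+            input_parameters={"detail": "high", "crop": True},
+        )
+    ],
+)
+
+# Compare input parameters (but not outputs) when scoring tool correctness
+metric = MultimodalToolCorrectnessMetric(
+    evaluation_params=[ToolCallParams.INPUT_PARAMETERS]
+)
+metric.measure(test_case)
+print(metric.score)  # 0.5: the tool name matches, and 1 of 2 input parameters match
+print(metric.reason)
+```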