diff --git a/deepeval/dataset/utils.py b/deepeval/dataset/utils.py
index 9ac5f8409..810c1d992 100644
--- a/deepeval/dataset/utils.py
+++ b/deepeval/dataset/utils.py
@@ -1,6 +1,6 @@
 from typing import List, Optional, Any
 from deepeval.dataset.api import Golden, ConversationalGolden
-from deepeval.test_case import LLMTestCase, ConversationalTestCase
+from deepeval.test_case import LLMTestCase, ConversationalTestCase, MLLMTestCase
 import json
 import re

@@ -27,10 +27,10 @@ def convert_goldens_to_test_cases(
     goldens: List[Golden],
     _alias: Optional[str] = None,
     _id: Optional[str] = None,
-) -> List[LLMTestCase]:
+) -> List[LLMTestCase | MLLMTestCase]:
     test_cases = []
     for index, golden in enumerate(goldens):
         test_case = LLMTestCase(
             input=golden.input,
             actual_output=golden.actual_output,
             expected_output=golden.expected_output,
diff --git a/deepeval/evaluate.py b/deepeval/evaluate.py
index 4e82eb0fe..4f0fc5975 100644
--- a/deepeval/evaluate.py
+++ b/deepeval/evaluate.py
@@ -234,6 +234,8 @@ def create_api_test_case(
             name=name,
             multimodalInput=test_case.input,
             multimodalActualOutput=test_case.actual_output,
+            toolsCalled=test_case.tools_called,
+            expectedTools=test_case.expected_tools,
             success=success,
             metricsData=metrics_data,
             runDuration=None,
diff --git a/deepeval/metrics/__init__.py b/deepeval/metrics/__init__.py
index 1e43d5773..4dc780b00 100644
@@ -41,4 +41,5 @@
     MultimodalContextualPrecisionMetric,
     MultimodalAnswerRelevancyMetric,
     MultimodalFaithfulnessMetric,
+    MultimodalToolCorrectnessMetric,
 )
diff --git a/deepeval/metrics/multimodal_metrics/__init__.py b/deepeval/metrics/multimodal_metrics/__init__.py
index dd49829f6..aa4e27775 100644
@@ -18,3 +18,7 @@
 from .multimodal_faithfulness.multimodal_faithfulness import (
     MultimodalFaithfulnessMetric,
 )
+
+from .multimodal_tool_correctness.multimodal_tool_correctness import (
+    MultimodalToolCorrectnessMetric,
+)
diff --git a/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py b/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py b/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py
new file mode 100644
index 000000000..121fae9f4
--- /dev/null
+++ b/deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py
@@ -0,0 +1,275 @@
+from typing import List, Dict
+
+from deepeval.metrics.indicator import metric_progress_indicator
+from deepeval.metrics.utils import (
+    construct_verbose_logs,
+    check_llm_test_case_params,
+)
+from deepeval.test_case import (
+    MLLMTestCase,
+    MLLMTestCaseParams,
+    ToolCallParams,
+    ToolCall,
+)
+from deepeval.metrics import BaseMetric
+
+# Simplified required params for MLLM only
+required_params: List[MLLMTestCaseParams] = [
+    MLLMTestCaseParams.INPUT,
+    MLLMTestCaseParams.ACTUAL_OUTPUT,
+    MLLMTestCaseParams.TOOLS_CALLED,
+    MLLMTestCaseParams.EXPECTED_TOOLS,
+]
+
+
+class MultimodalToolCorrectnessMetric(BaseMetric):
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        evaluation_params: List[ToolCallParams] = [],
+        include_reason: bool = True,
+        strict_mode: bool = False,
+        verbose_mode: bool = False,
+        should_exact_match: bool = False,
+        should_consider_ordering: bool = False,
+    ):
+        self.threshold = 1 if strict_mode else threshold
+        self.include_reason = include_reason
+        self.strict_mode = strict_mode
+        self.verbose_mode = verbose_mode
+        self.evaluation_params: List[ToolCallParams] = evaluation_params
+        self.should_exact_match = should_exact_match
+        self.should_consider_ordering = should_consider_ordering
+
+    def measure(
+        self,
+        test_case: MLLMTestCase,
+        _show_indicator: bool = True,
+    ) -> float:
+        check_llm_test_case_params(test_case, required_params, self)
+        self.test_case = test_case
+        with metric_progress_indicator(self, _show_indicator=_show_indicator):
+            self.tools_called: List[ToolCall] = test_case.tools_called
+            self.expected_tools: List[ToolCall] = test_case.expected_tools
+            self.score = self._calculate_score()
+            self.reason = self._generate_reason()
+            self.success = self.score >= self.threshold
+            expected_tools_formatted = (
+                "Expected Tools:\n[\n"
+                + ",\n".join(
+                    self.indent_multiline_string(
+                        repr(tool_call), indent_level=4
+                    )
+                    for tool_call in self.expected_tools
+                )
+                + "\n]"
+            )
+            tools_called_formatted = (
+                "Tools Called:\n[\n"
+                + ",\n".join(
+                    self.indent_multiline_string(
+                        repr(tool_call), indent_level=4
+                    )
+                    for tool_call in self.tools_called
+                )
+                + "\n]"
+            )
+            steps = [
+                f"{expected_tools_formatted}",
+                f"{tools_called_formatted}",
+            ]
+            steps.append(f"Score: {self.score}\nReason: {self.reason}")
+            self.verbose_logs = construct_verbose_logs(self, steps=steps)
+            return self.score
+
+    async def a_measure(
+        self, test_case: MLLMTestCase, _show_indicator: bool = True
+    ) -> float:
+        return self.measure(test_case, _show_indicator=_show_indicator)
+
+    ##################################################
+    ### Tool Correctness (Tool) ######################
+    ##################################################
+
+    def _generate_reason(self):
+        tools_called_names = [
+            tool_called.name for tool_called in self.tools_called
+        ]
+        expected_tools_names = [
+            expected_tool.name for expected_tool in self.expected_tools
+        ]
+
+        if self.should_exact_match:
+            return f"{'Exact match' if self._calculate_exact_match_score() else 'Not an exact match'}: expected {expected_tools_names}, called {tools_called_names}. See details above."
+
+        elif self.should_consider_ordering:
+            lcs, weighted_length = self._compute_weighted_lcs()
+            score = weighted_length / len(expected_tools_names)
+            missing = set(expected_tools_names) - set(tools_called_names)
+            out_of_order = set(expected_tools_names) - set(
+                [tool.name for tool in lcs]
+            )
+            if score == 1:
+                return f"Correct ordering: all expected tools {expected_tools_names} were called in the correct order."
+            else:
+                issues = []
+                if missing:
+                    issues.append(f"missing tools {list(missing)}")
+                if out_of_order:
+                    issues.append(f"out-of-order tools {list(out_of_order)}")
+                return f"Incorrect tool usage: {' and '.join(issues)}; expected {expected_tools_names}, called {tools_called_names}. See more details above."
+        else:
+            used_expected = set(self.tools_called).intersection(
+                set(self.expected_tools)
+            )
+            missing = set(self.expected_tools) - used_expected
+            if self._calculate_non_exact_match_score() == 1:
+                return f"All expected tools {expected_tools_names} were called (order not considered)."
+            else:
+                return f"Incomplete tool usage: missing tools {list(missing)}; expected {expected_tools_names}, called {tools_called_names}. See more details above."
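+
+    # Note: the scoring helpers below implement three modes, from strictest to
+    # most lenient:
+    #   1. should_exact_match: tools_called must match expected_tools element-wise
+    #      (names, plus input parameters/outputs when included in evaluation_params).
+    #   2. should_consider_ordering: a weighted longest-common-subsequence over tool
+    #      names rewards calls that appear in the expected order.
+    #   3. default: each expected tool is greedily matched to at most one called
+    #      tool, ignoring order; the score is the matched fraction of expected tools.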
+ + ################################################## + ### Score Helper Functions ####################### + ################################################## + + # Calculate score + def _calculate_score(self): + if self.should_exact_match: + score = self._calculate_exact_match_score() + elif self.should_consider_ordering: + _, weighted_length = self._compute_weighted_lcs() + score = weighted_length / len(self.expected_tools) + else: + score = self._calculate_non_exact_match_score() + return 0 if self.strict_mode and score < self.threshold else score + + # Exact matching score + def _calculate_exact_match_score(self): + if len(self.tools_called) != len(self.expected_tools): + return 0.0 + for i in range(len(self.tools_called)): + if self.tools_called[i].name != self.expected_tools[i].name: + return 0.0 + if ToolCallParams.INPUT_PARAMETERS in self.evaluation_params: + if ( + self.tools_called[i].input_parameters + != self.expected_tools[i].input_parameters + ): + return 0.0 + if ToolCallParams.OUTPUT in self.evaluation_params: + if self.tools_called[i].output != self.expected_tools[i].output: + return 0.0 + return 1.0 + + # Non exact matching score + def _calculate_non_exact_match_score(self): + total_score = 0.0 + matched_called_tools = set() + for expected_tool in self.expected_tools: + best_score = 0.0 + for called_tool in self.tools_called: + if called_tool in matched_called_tools: + continue + if expected_tool.name == called_tool.name: + match_score = 1.0 + if ( + ToolCallParams.INPUT_PARAMETERS + in self.evaluation_params + ): + match_score *= self._compare_dicts( + expected_tool.input_parameters, + called_tool.input_parameters, + ) + if ( + ToolCallParams.OUTPUT in self.evaluation_params + and expected_tool.output != called_tool.output + ): + match_score = 0.0 + if match_score > best_score: + best_score = match_score + best_called_tool = called_tool + if best_score > 0: + total_score += best_score + matched_called_tools.add(best_called_tool) + return ( + total_score / len(self.expected_tools) + if self.expected_tools + else 0.0 + ) + + # Consider ordering score + def _compute_weighted_lcs(self): + m, n = len(self.expected_tools), len(self.tools_called) + dp = [[0.0] * (n + 1) for _ in range(m + 1)] + for i in range(1, m + 1): + for j in range(1, n + 1): + expected_tool, called_tool = ( + self.expected_tools[i - 1], + self.tools_called[j - 1], + ) + if expected_tool.name != called_tool.name: + dp[i][j] = max(dp[i - 1][j], dp[i][j - 1]) + continue + score = 1.0 + if ToolCallParams.INPUT_PARAMETERS in self.evaluation_params: + score *= self._compare_dicts( + expected_tool.input_parameters, + called_tool.input_parameters, + ) + if ( + ToolCallParams.OUTPUT in self.evaluation_params + and expected_tool.output != called_tool.output + ): + score = 0.0 + dp[i][j] = max( + dp[i - 1][j], + dp[i][j - 1], + dp[i - 1][j - 1] + score if score > 0 else 0, + ) + i, j, total_score = m, n, 0.0 + lcs = [] + while i > 0 and j > 0: + if dp[i][j] == dp[i - 1][j]: + i -= 1 + elif dp[i][j] == dp[i][j - 1]: + j -= 1 + else: + lcs.append(self.expected_tools[i - 1]) + total_score += dp[i][j] - dp[i - 1][j - 1] + i, j = i - 1, j - 1 + return lcs[::-1], total_score + + # For matching input parameters + def _compare_dicts(self, dict1: Dict, dict2: Dict): + if self.should_exact_match: + return 1.0 if dict1 == dict2 else 0.0 + match_score = 0 + matched_keys = set(dict1.keys()).intersection(set(dict2.keys())) + total_keys = set(dict1.keys()).union(set(dict2.keys())) + for key in matched_keys: + if dict1[key] == 
dict2[key]: + match_score += 1 / len(total_keys) + elif isinstance(dict1[key], dict) and isinstance(dict2[key], dict): + match_score += self._compare_dicts( + dict1[key], dict2[key] + ) / len(total_keys) + return match_score + + ################################################## + ### Others ####################################### + ################################################## + + def is_successful(self) -> bool: + try: + self.success = self.score >= self.threshold + except: + self.success = False + return self.success + + @property + def __name__(self): + return "Tool Correctness" + + def indent_multiline_string(self, s, indent_level=4): + indent = " " * indent_level + return "\n".join(f"{indent}{line}" for line in s.splitlines()) diff --git a/deepeval/metrics/tool_correctness/tool_correctness.py b/deepeval/metrics/tool_correctness/tool_correctness.py index a2583ff59..501e4bab3 100644 --- a/deepeval/metrics/tool_correctness/tool_correctness.py +++ b/deepeval/metrics/tool_correctness/tool_correctness.py @@ -274,4 +274,4 @@ def __name__(self): def indent_multiline_string(self, s, indent_level=4): indent = " " * indent_level - return "\n".join(f"{indent}{line}" for line in s.splitlines()) + return "\n".join(f"{indent}{line}" for line in s.splitlines()) \ No newline at end of file diff --git a/deepeval/test_case/mllm_test_case.py b/deepeval/test_case/mllm_test_case.py index 474d08f30..73ceb8cd0 100644 --- a/deepeval/test_case/mllm_test_case.py +++ b/deepeval/test_case/mllm_test_case.py @@ -1,8 +1,10 @@ import os from urllib.parse import urlparse from dataclasses import dataclass, field -from typing import List, Optional, Dict, Union +from typing import List, Optional, Dict, Union, Any from enum import Enum +import json +from pydantic import BaseModel, Field @dataclass @@ -26,6 +28,78 @@ def is_local_path(url): return False +class ToolCall(BaseModel): + name: str + description: Optional[str] = None + reasoning: Optional[str] = None + output: Optional[Any] = None + input_parameters: Optional[Dict[str, Any]] = Field( + None, serialization_alias="inputParameters" + ) + + def __eq__(self, other): + if not isinstance(other, ToolCall): + return False + return ( + self.name == other.name + and self.input_parameters == other.input_parameters + and self.output == other.output + ) + + def __hash__(self): + input_params = ( + self.input_parameters if self.input_parameters is not None else {} + ) + output_hashable = ( + frozenset(self.output.items()) + if isinstance(self.output, dict) + else self.output + ) + return hash( + (self.name, frozenset(input_params.items()), output_hashable) + ) + + def __repr__(self): + fields = [] + + # Add basic fields + if self.name: + fields.append(f'name="{self.name}"') + if self.description: + fields.append(f'description="{self.description}"') + if self.reasoning: + fields.append(f'reasoning="{self.reasoning}"') + + # Handle nested fields like input_parameters + if self.input_parameters: + formatted_input = json.dumps(self.input_parameters, indent=4) + formatted_input = self._indent_nested_field( + "input_parameters", formatted_input + ) + fields.append(formatted_input) + + # Handle nested fields like output + if isinstance(self.output, dict): + formatted_output = json.dumps(self.output, indent=4) + formatted_output = self._indent_nested_field( + "output", formatted_output + ) + fields.append(formatted_output) + elif self.output is not None: + fields.append(f"output={repr(self.output)}") + + # Combine fields with proper formatting + fields_str = ",\n ".join(fields) 
+ return f"ToolCall(\n {fields_str}\n)" + + @staticmethod + def _indent_nested_field(field_name: str, formatted_field: str) -> str: + """Helper method to indent multi-line fields for better readability.""" + lines = formatted_field.splitlines() + return f"{field_name}={lines[0]}\n" + "\n".join( + f" {line}" for line in lines[1:] + ) + class MLLMTestCaseParams(Enum): INPUT = "input" @@ -33,6 +107,8 @@ class MLLMTestCaseParams(Enum): EXPECTED_OUTPUT = "expected_output" CONTEXT = "context" RETRIEVAL_CONTEXT = "retrieval_context" + TOOLS_CALLED = "tools_called" + EXPECTED_TOOLS = "expected_tools" @dataclass @@ -44,6 +120,8 @@ class MLLMTestCase: retrieval_context: Optional[List[Union[str, MLLMImage]]] = None additional_metadata: Optional[Dict] = None comments: Optional[str] = None + tools_called: Optional[List[ToolCall]] = None + expected_tools: Optional[List[ToolCall]] = None name: Optional[str] = field(default=None) _dataset_rank: Optional[int] = field(default=None, repr=False) _dataset_alias: Optional[str] = field(default=None, repr=False) @@ -78,3 +156,21 @@ def __post_init__(self): raise TypeError( "'retrieval_context' must be None or a list of strings or MLLMImage instances" ) + + # Ensure `tools_called` is None or a list of strings + if self.tools_called is not None: + if not isinstance(self.tools_called, list) or not all( + isinstance(item, ToolCall) for item in self.tools_called + ): + raise TypeError( + "'tools_called' must be None or a list of `ToolCall`" + ) + + # Ensure `expected_tools` is None or a list of strings + if self.expected_tools is not None: + if not isinstance(self.expected_tools, list) or not all( + isinstance(item, ToolCall) for item in self.expected_tools + ): + raise TypeError( + "'expected_tools' must be None or a list of `ToolCall`" + ) diff --git a/docs/docs/multimodal-metrics-tool-correctness.mdx b/docs/docs/multimodal-metrics-tool-correctness.mdx new file mode 100644 index 000000000..a941cbab4 --- /dev/null +++ b/docs/docs/multimodal-metrics-tool-correctness.mdx @@ -0,0 +1,75 @@ +--- +id: multimodal-metrics-tool-correctness +title: Multimodal Tool Correctness +sidebar_label: Tool Correctness +--- + +import Equation from "@site/src/components/equation"; + +The multimodal tool correctness metric is an agentic LLM metric that assesses your multimodal LLM agent's function/tool calling ability. It is calculated by comparing whether every tool that is expected to be used was indeed called. + +:::info +The `MultimodalToolCorrectnessMetric` allows you to define the **strictness** of correctness. By default, it considers matching tool names to be correct, but you can also require input parameters and output to match. 
+::: + +## Required Arguments + +To use the `MultimodalToolCorrectnessMetric`, you'll have to provide the following arguments when creating an [`MLLMTestCase`](/docs/evaluation-test-cases#mllm-test-case): + +- `input` +- `actual_output` +- `tools_called` +- `expected_tools` + +## Example + +```python +from deepeval.metrics import MultimodalToolCorrectnessMetric +from deepeval.test_case import MLLMTestCase, ToolCall + +test_case = MLLMTestCase( + input="What's in this image?", + actual_output="The image shows a pair of running shoes.", + # Replace this with the tools that was actually used by your LLM agent + tools_called=[ToolCall(name="ImageAnalysis"), ToolCall(name="ToolQuery")], + expected_tools=[ToolCall(name="ImageAnalysis")], +) + +metric = MultimodalToolCorrectnessMetric() +metric.measure(test_case) +print(metric.score) +print(metric.reason) +``` + +There are seven optional parameters when creating a `MultimodalToolCorrectnessMetric`: + +- [Optional] `threshold`: a float representing the minimum passing threshold, defaulted to 0.5. +- [Optional] `evaluation_params`: A list of `ToolCallParams` indicating the strictness of the correctness criteria, available options are `ToolCallParams.INPUT_PARAMETERS` and `ToolCallParams.OUTPUT`. For example, supplying a list containing `ToolCallParams.INPUT_PARAMETERS` but excluding `ToolCallParams.OUTPUT`, will deem a tool correct if the tool name and input parameters match, even if the output does not. Defaults to an empty list. +- [Optional] `include_reason`: a boolean which when set to `True`, will include a reason for its evaluation score. Defaulted to `True`. +- [Optional] `strict_mode`: a boolean which when set to `True`, enforces a binary metric score: 1 for perfection, 0 otherwise. It also overrides the current threshold and sets it to 1. Defaulted to `False`. +- [Optional] `verbose_mode`: a boolean which when set to `True`, prints the intermediate steps used to calculate said metric to the console, as outlined in the [How Is It Calculated](#how-is-it-calculated) section. Defaulted to `False`. +- [Optional] `should_consider_ordering`: a boolean which when set to `True`, will consider the ordering in which the tools were called in. For example, if `expected_tools=[ToolCall(name="ImageAnalysis"), ToolCall(name="ToolQuery"), ToolCall(name="ImageAnalysis")]` and `tools_called=[ToolCall(name="ImageAnalysis"), ToolCall(name="ImageAnalysis"), ToolCall(name="ToolQuery")]`, the metric will consider the tool calling to be incorrect. Only available for `ToolCallParams.TOOL` and defaulted to `False`. +- [Optional] `should_exact_match`: a boolean which when set to `True`, will require the `tools_called` and `expected_tools` to be exactly the same. Available for `ToolCallParams.TOOL` and `ToolCallParams.INPUT_PARAMETERS` and defaulted to `False`. + +:::info +Since `should_exact_match` is a stricter criteria than `should_consider_ordering`, setting `should_consider_ordering` will have no effect when `should_exact_match` is set to `True`. +::: + +## How Is It Calculated? + +:::note +The `MultimodalToolCorrectnessMetric`, unlike all other `deepeval` metrics, is not calculated using any models or LLMs, and instead via exact matching between the `expected_tools` and `tools_called` parameters. +::: + +The **multimodal tool correctness metric** score is calculated according to the following equation: + + + +This metric assesses the accuracy of your agent's tool usage by comparing the `tools_called` by your multimodal LLM agent to the list of `expected_tools`. 
A score of 1 indicates that every tool utilized by your LLM agent was called correctly according to the list of `expected_tools`, `should_consider_ordering`, and `should_exact_match`, while a score of 0 signifies that none of the `tools_called` were called correctly. + +:::info +If `exact_match` is not specified and `ToolCall.INPUT_PARAMETERS` is included in `evaluation_params`, correctness may be a percentage score based on the proportion of correct input parameters (assuming the name and output are correct, if applicable). +:::
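+
+To make the partial-credit behavior concrete, below is a minimal, illustrative sketch; the tool name and input parameters are hypothetical and not part of any real toolset:
+
+```python
+from deepeval.metrics import MultimodalToolCorrectnessMetric
+from deepeval.test_case import MLLMTestCase, ToolCall, ToolCallParams
+
+test_case = MLLMTestCase(
+    input="Describe this product photo.",
+    actual_output="A red running shoe on a white background.",
+    # Hypothetical tool call: the name matches the expected tool, but only
+    # one of the two input parameters agrees with what was expected.
+    tools_called=[
+        ToolCall(
+            name="ImageAnalysis",
+            input_parameters={"detail": "high", "crop": False},
+        )
+    ],
+    expected_tools=[
+        ToolCall(
+            name="ImageAnalysis",
+            input_parameters={"detail": "high", "crop": True},
+        )
+    ],
+)
+
+# Compare input parameters (but not outputs) when scoring tool correctness
+metric = MultimodalToolCorrectnessMetric(
+    evaluation_params=[ToolCallParams.INPUT_PARAMETERS]
+)
+metric.measure(test_case)
+print(metric.score)  # 0.5: the tool name matches, and 1 of 2 input parameters match
+print(metric.reason)
+```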