
Commit e9a1045

Merge pull request #414 from confident-ai/feature/cost-latency
added latency and cost metric
penguine-ip authored Jan 18, 2024
2 parents 294bf37 + 5c8923d commit e9a1045
Showing 9 changed files with 78 additions and 16 deletions.
2 changes: 2 additions & 0 deletions deepeval/metrics/__init__.py
@@ -9,6 +9,8 @@
from .contextual_recall import ContextualRecallMetric
from .contextual_relevancy import ContextualRelevancyMetric
from .contextual_precision import ContextualPrecisionMetric
from .latency import LatencyMetric
from .cost import CostMetric
from .ragas_metric import (
    RagasMetric,
    RAGASAnswerRelevancyMetric,
19 changes: 19 additions & 0 deletions deepeval/metrics/cost.py
@@ -0,0 +1,19 @@
from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase


class CostMetric(BaseMetric):
    def __init__(self, threshold: float):
        self.threshold = threshold

    def measure(self, test_case: LLMTestCase):
        self.success = test_case.cost <= self.threshold
        self.score = test_case.cost
        return self.score

    def is_successful(self):
        return self.success

    @property
    def __name__(self):
        return "Cost"
19 changes: 19 additions & 0 deletions deepeval/metrics/latency.py
@@ -0,0 +1,19 @@
from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase


class LatencyMetric(BaseMetric):
    def __init__(self, threshold: float):
        self.threshold = threshold

    def measure(self, test_case: LLMTestCase):
        self.success = test_case.latency <= self.threshold
        self.score = test_case.latency
        return self.score

    def is_successful(self):
        return self.success

    @property
    def __name__(self):
        return "Latency"
4 changes: 2 additions & 2 deletions deepeval/test_case.py
@@ -20,7 +20,7 @@ def __init__(
        expected_output: Optional[str] = None,
        context: Optional[List[str]] = None,
        retrieval_context: Optional[List[str]] = None,
        execution_time: Optional[float] = None,
        latency: Optional[float] = None,
        cost: Optional[float] = None,
        id: Optional[str] = None,
    ):
@@ -30,5 +30,5 @@
        self.expected_output = expected_output
        self.context = context
        self.retrieval_context = retrieval_context
        self.execution_time = execution_time
        self.latency = latency
        self.cost = cost
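
Since this renames a public `LLMTestCase` keyword argument rather than adding an alias, callers that still pass `execution_time` will raise a `TypeError`. A rough before/after of the one-line migration:

```python
from deepeval.test_case import LLMTestCase

# Before this commit:
#   LLMTestCase(input="...", actual_output="...", execution_time=8.3)
# After this commit:
test_case = LLMTestCase(input="...", actual_output="...", latency=8.3)
```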
18 changes: 9 additions & 9 deletions docs/docs/evaluation-test-cases.mdx
@@ -13,7 +13,7 @@ A test case is a blueprint provided by `deepeval` to unit test LLM outputs based
- [Optional] `expected_output`
- [Optional] `context`
- [Optional] `retrieval_context`
- [Optional] `execution_time` (`float`)
- [Optional] `latency` (`float`)
- [Optional] `cost` (`float`)

Except for `actual_output`, all parameters should originate from your evaluation dataset (if you have one). Here's an example implementation of a test case:
@@ -25,7 +25,7 @@ test_case = LLMTestCase(
actual_output="We offer a 30-day full refund at no extra cost.",
context=["All customers are eligible for a 30 day full refund at no extra cost."],
retrieval_context=["Only shoes can be refunded."],
execution_time=10.0
latency=10.0
)
```

@@ -174,32 +174,32 @@ test_case = LLMTestCase(
Remember, `context` contains the ideal retrieval results for a given input and typically comes from your evaluation dataset, whereas `retrieval_context` contains your LLM application's actual retrieval results.
:::

## Execution Time
## Latency

The `execution_time` is an **optional** parameter that represents how long it took your LLM application to finish executing. However, if you're trying to measure something else, like the execution time for only the retrieval part of your RAG pipeline, feel free to supply that number instead of the overall execution time.
The `latency` is an **optional** parameter that represents how long it took your LLM application to finish executing. However, if you're trying to measure something else, like the latency for only the retrieval part of your RAG pipeline, feel free to supply that number instead of the overall latency.

```python
test_case = LLMTestCase(
input="...",
actual_output="...",
# Replace this with the actual execution time of your LLM application
execution_time=10.4
# Replace this with the actual latency of your LLM application
latency=10.4
)
```

:::note
`deepeval` does not offer metrics that evaluate on latency and cost, so feel free to supply the `execution_time` in either seconds, miliseconds, or even nanoseconds. That being said, [here is a full working example](metrics-custom#implementation) of how you can build your own `LatencyMetric` using the `execution_time` parameter.
`deepeval` does not offer metrics that evaluate on latency and cost, so feel free to supply the `latency` in either seconds, milliseconds, or even nanoseconds. That being said, [here is a full working example](metrics-custom#implementation) of how you can build your own `LatencyMetric` using the `latency` parameter.
:::

## Cost

The `cost` is an **optional** parameter that represents the token cost for a given execution of your LLM application. However, similar to `execution_time`, the `cost` parameter does not strictly have to be the total completion cost (eg., it could be the embedding cost), nor does it have to be in any set currency.
The `cost` is an **optional** parameter that represents the token cost for a given execution of your LLM application. However, similar to `latency`, the `cost` parameter does not strictly have to be the total completion cost (e.g., it could be the embedding cost), nor does it have to be in any set currency.

```python
test_case = LLMTestCase(
input="...",
actual_output="...",
# Replace this with the actual execution time of your LLM application
# Replace this with the actual latency of your LLM application
cost=0.78
)
```
6 changes: 3 additions & 3 deletions docs/docs/metrics-custom.mdx
@@ -31,7 +31,7 @@ class LatencyMetric(BaseMetric):

    def measure(self, test_case: LLMTestCase):
        # Set self.success and self.score in the "measure" method
        self.success = test_case.execution_time <= self.threshold
        self.success = test_case.latency <= self.threshold
        if self.success:
            self.score = 1
        else:
@@ -69,8 +69,8 @@ from deepeval.test_case import LLMTestCase
...


# Note that we pass in execution time since the measure method requires it for evaluation
# Note that we pass in latency since the measure method requires it for evaluation
latency_metric = LatencyMetric(max_seconds=10.0)
test_case = LLMTestCase(input="...", actual_output="...", execution_time=8.3)
test_case = LLMTestCase(input="...", actual_output="...", latency=8.3)
evaluate([test_case], [latency_metric])
```
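
The hunks above only show fragments of the docs' custom metric (the body of the `else` branch and the closing methods are hidden behind the collapsed diff). One plausible way the full class fits together, reconstructed from the visible fragments rather than copied from the file:

```python
from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase


class LatencyMetric(BaseMetric):
    # Reconstructed sketch; the collapsed lines in the diff may differ slightly.
    # This custom metric passes when latency stays within max_seconds.
    def __init__(self, max_seconds: float = 10):
        self.threshold = max_seconds

    def measure(self, test_case: LLMTestCase):
        # Set self.success and self.score in the "measure" method
        self.success = test_case.latency <= self.threshold
        if self.success:
            self.score = 1
        else:
            self.score = 0
        return self.score

    def is_successful(self):
        return self.success

    @property
    def __name__(self):
        return "Latency"
```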
9 changes: 9 additions & 0 deletions tests/test_cost.py
@@ -0,0 +1,9 @@
from deepeval.metrics import CostMetric
from deepeval.test_case import LLMTestCase
from deepeval import assert_test


def test_cost_metric():
    metric = CostMetric(threshold=12)
    test_case = LLMTestCase(input="...", actual_output="...", cost=12)
    assert_test(test_case, [metric])
4 changes: 2 additions & 2 deletions tests/test_custom_metric.py
@@ -13,7 +13,7 @@ def __init__(self, max_seconds: float = 10):

    def measure(self, test_case: LLMTestCase):
        # Set self.success and self.score in the "measure" method
        self.success = test_case.execution_time <= self.threshold
        self.success = test_case.latency <= self.threshold
        if self.success:
            self.score = 1
        else:
@@ -37,6 +37,6 @@ def test_length_metric():
    test_case = LLMTestCase(
        input="placeholder",
        actual_output="This is a long sentence that is more than 3 letters",
        execution_time=8.3,
        latency=8.3,
    )
    assert_test(test_case, [metric])
13 changes: 13 additions & 0 deletions tests/test_latency.py
@@ -0,0 +1,13 @@
from deepeval.metrics import LatencyMetric
from deepeval.test_case import LLMTestCase
from deepeval import assert_test


def test_latency_metric():
    metric = LatencyMetric(threshold=12)
    test_case = LLMTestCase(
        input="...",
        actual_output="...",
        latency=8.3,
    )
    assert_test(test_case, [metric])
