
Commit e9a1045

Merge pull request #414 from confident-ai/feature/cost-latency
added latency and cost metric
penguine-ip authored Jan 18, 2024
2 parents 294bf37 + 5c8923d commit e9a1045
Showing 9 changed files with 78 additions and 16 deletions.
2 changes: 2 additions & 0 deletions deepeval/metrics/__init__.py
@@ -9,6 +9,8 @@
from .contextual_recall import ContextualRecallMetric
from .contextual_relevancy import ContextualRelevancyMetric
from .contextual_precision import ContextualPrecisionMetric
from .latency import LatencyMetric
from .cost import CostMetric
from .ragas_metric import (
    RagasMetric,
    RAGASAnswerRelevancyMetric,
19 changes: 19 additions & 0 deletions deepeval/metrics/cost.py
@@ -0,0 +1,19 @@
from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase


class CostMetric(BaseMetric):
    def __init__(self, threshold: float):
        self.threshold = threshold

    def measure(self, test_case: LLMTestCase):
        self.success = test_case.cost <= self.threshold
        self.score = test_case.cost
        return self.score

    def is_successful(self):
        return self.success

    @property
    def __name__(self):
        return "Cost"
19 changes: 19 additions & 0 deletions deepeval/metrics/latency.py
@@ -0,0 +1,19 @@
from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase


class LatencyMetric(BaseMetric):
    def __init__(self, threshold: float):
        self.threshold = threshold

    def measure(self, test_case: LLMTestCase):
        self.success = test_case.latency <= self.threshold
        self.score = test_case.latency
        return self.score

    def is_successful(self):
        return self.success

    @property
    def __name__(self):
        return "Latency"
4 changes: 2 additions & 2 deletions deepeval/test_case.py
@@ -20,7 +20,7 @@ def __init__(
        expected_output: Optional[str] = None,
        context: Optional[List[str]] = None,
        retrieval_context: Optional[List[str]] = None,
        execution_time: Optional[float] = None,
        latency: Optional[float] = None,
        cost: Optional[float] = None,
        id: Optional[str] = None,
    ):
@@ -30,5 +30,5 @@
        self.expected_output = expected_output
        self.context = context
        self.retrieval_context = retrieval_context
        self.execution_time = execution_time
        self.latency = latency
        self.cost = cost
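
Since this renames a public `LLMTestCase` keyword argument rather than adding an alias, callers that still pass `execution_time` will raise a `TypeError`. A rough before/after of the one-line migration:

```python
from deepeval.test_case import LLMTestCase

# Before this commit:
#   LLMTestCase(input="...", actual_output="...", execution_time=8.3)
# After this commit:
test_case = LLMTestCase(input="...", actual_output="...", latency=8.3)
```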
18 changes: 9 additions & 9 deletions docs/docs/evaluation-test-cases.mdx
@@ -13,7 +13,7 @@ A test case is a blueprint provided by `deepeval` to unit test LLM outputs based
- [Optional] `expected_output`
- [Optional] `context`
- [Optional] `retrieval_context`
- [Optional] `execution_time` (`float`)
- [Optional] `latency` (`float`)
- [Optional] `cost` (`float`)

Except for `actual_output`, all parameters should originate from your evaluation dataset (if you have one). Here's an example implementation of a test case:
@@ -25,7 +25,7 @@ test_case = LLMTestCase(
actual_output="We offer a 30-day full refund at no extra cost.",
context=["All customers are eligible for a 30 day full refund at no extra cost."],
retrieval_context=["Only shoes can be refunded."],
execution_time=10.0
latency=10.0
)
```

@@ -174,32 +174,32 @@ test_case = LLMTestCase(
Remember, `context` contains the ideal retrieval results for a given input and typically comes from your evaluation dataset, whereas `retrieval_context` contains your LLM application's actual retrieval results.
:::

## Execution Time
## Latency

The `execution_time` is an **optional** parameter that represents how long it took your LLM application to finish executing. However, if you're trying to measure something else, like the execution time for only the retrieval part of your RAG pipeline, feel free to supply that number instead of the overall execution time.
The `latency` is an **optional** parameter that represents how long it took your LLM application to finish executing. However, if you're trying to measure something else, like the latency for only the retrieval part of your RAG pipeline, feel free to supply that number instead of the overall latency.

```python
test_case = LLMTestCase(
input="...",
actual_output="...",
# Replace this with the actual execution time of your LLM application
execution_time=10.4
# Replace this with the actual latency of your LLM application
latency=10.4
)
```

:::note
`deepeval` does not offer metrics that evaluate on latency and cost, so feel free to supply the `execution_time` in either seconds, miliseconds, or even nanoseconds. That being said, [here is a full working example](metrics-custom#implementation) of how you can build your own `LatencyMetric` using the `execution_time` parameter.
`deepeval` does not offer metrics that evaluate on latency and cost, so feel free to supply the `latency` in either seconds, milliseconds, or even nanoseconds. That being said, [here is a full working example](metrics-custom#implementation) of how you can build your own `LatencyMetric` using the `latency` parameter.
:::

## Cost

The `cost` is an **optional** parameter that represents the token cost for a given execution of your LLM application. However, similar to `execution_time`, the `cost` parameter does not strictly have to be the total completion cost (eg., it could be the embedding cost), nor does it have to be in any set currency.
The `cost` is an **optional** parameter that represents the token cost for a given execution of your LLM application. However, similar to `latency`, the `cost` parameter does not strictly have to be the total completion cost (e.g., it could be the embedding cost), nor does it have to be in any set currency.

```python
test_case = LLMTestCase(
input="...",
actual_output="...",
# Replace this with the actual execution time of your LLM application
# Replace this with the actual latency of your LLM application
cost=0.78
)
```
6 changes: 3 additions & 3 deletions docs/docs/metrics-custom.mdx
@@ -31,7 +31,7 @@ class LatencyMetric(BaseMetric):

    def measure(self, test_case: LLMTestCase):
        # Set self.success and self.score in the "measure" method
        self.success = test_case.execution_time <= self.threshold
        self.success = test_case.latency <= self.threshold
        if self.success:
            self.score = 1
        else:
@@ -69,8 +69,8 @@ from deepeval.test_case import LLMTestCase
...


# Note that we pass in execution time since the measure method requires it for evaluation
# Note that we pass in latency since the measure method requires it for evaluation
latency_metric = LatencyMetric(max_seconds=10.0)
test_case = LLMTestCase(input="...", actual_output="...", execution_time=8.3)
test_case = LLMTestCase(input="...", actual_output="...", latency=8.3)
evaluate([test_case], [latency_metric])
```
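
The hunks above only show fragments of the docs' custom metric (the body of the `else` branch and the closing methods are hidden behind the collapsed diff). One plausible way the full class fits together, reconstructed from the visible fragments rather than copied from the file:

```python
from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase


class LatencyMetric(BaseMetric):
    # Reconstructed sketch; the collapsed lines in the diff may differ slightly.
    # This custom metric passes when latency stays within max_seconds.
    def __init__(self, max_seconds: float = 10):
        self.threshold = max_seconds

    def measure(self, test_case: LLMTestCase):
        # Set self.success and self.score in the "measure" method
        self.success = test_case.latency <= self.threshold
        if self.success:
            self.score = 1
        else:
            self.score = 0
        return self.score

    def is_successful(self):
        return self.success

    @property
    def __name__(self):
        return "Latency"
```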
9 changes: 9 additions & 0 deletions tests/test_cost.py
@@ -0,0 +1,9 @@
from deepeval.metrics import CostMetric
from deepeval.test_case import LLMTestCase
from deepeval import assert_test


def test_cost_metric():
    metric = CostMetric(threshold=12)
    test_case = LLMTestCase(input="...", actual_output="...", cost=12)
    assert_test(test_case, [metric])
4 changes: 2 additions & 2 deletions tests/test_custom_metric.py
@@ -13,7 +13,7 @@ def __init__(self, max_seconds: float = 10):

    def measure(self, test_case: LLMTestCase):
        # Set self.success and self.score in the "measure" method
        self.success = test_case.execution_time <= self.threshold
        self.success = test_case.latency <= self.threshold
        if self.success:
            self.score = 1
        else:
@@ -37,6 +37,6 @@ def test_length_metric():
    test_case = LLMTestCase(
        input="placeholder",
        actual_output="This is a long sentence that is more than 3 letters",
        execution_time=8.3,
        latency=8.3,
    )
    assert_test(test_case, [metric])
13 changes: 13 additions & 0 deletions tests/test_latency.py
@@ -0,0 +1,13 @@
from deepeval.metrics import LatencyMetric
from deepeval.test_case import LLMTestCase
from deepeval import assert_test


def test_latency_metric():
    metric = LatencyMetric(threshold=12)
    test_case = LLMTestCase(
        input="...",
        actual_output="...",
        latency=8.3,
    )
    assert_test(test_case, [metric])
