feat(llmobs): support joining custom evaluations via tags (#11535)
This PR implements the `LLMObs.submit_evaluation_for` method, which gives
users two options for joining custom evaluations:
- by tag, via the `span_with_tag_value` argument, which accepts a dictionary
containing `tag_key` and `tag_value` keys
- by span, via the `span` argument, which accepts a dictionary containing
`span_id` and `trace_id` keys
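
For illustration, the two joining payloads have these shapes (the IDs and tag values below are made-up placeholders):

```python
# Option 1: join by span -- the dict returned by LLMObs.export_span().
# IDs here are placeholders, not real span/trace IDs.
join_by_span = {"span_id": "12345678901", "trace_id": "98765432101"}

# Option 2: join by tag -- a key/value pair expected to uniquely
# identify a single span.
join_by_tag = {"tag_key": "message_id", "tag_value": "dummy_message_id"}
```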

There are also a couple of behavioral differences between
`submit_evaluation_for` and `submit_evaluation`. The new method:
- throws whenever a required argument has the wrong value or type
- removes the `metadata` argument
- moves the warning log for a missing API key to the eval metric writer's
`periodic` method
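
The stricter argument checks can be sketched as follows (a simplified stand-in for the real validation inside `submit_evaluation_for`, not the ddtrace implementation itself):

```python
from typing import Union


def validate_evaluation(label: str, metric_type: str,
                        value: Union[str, int, float]) -> None:
    # Raise instead of logging a warning, mirroring the new behavior.
    if not label:
        raise ValueError("label must be the specified name of the evaluation metric.")
    metric_type = metric_type.lower()
    if metric_type not in ("categorical", "score"):
        raise ValueError("metric_type must be one of 'categorical' or 'score'.")
    if metric_type == "categorical" and not isinstance(value, str):
        raise TypeError("value must be a string for a categorical metric.")
    if metric_type == "score" and not isinstance(value, (int, float)):
        raise TypeError("value must be an integer or float for a score metric.")
```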

Other changes:
#### Eval metric writer
Updates the eval metric writer to write to the `v2` eval metric endpoint.
The main difference is that this endpoint accepts a `join_on` field that
holds the joining information, instead of top-level trace and span ID
fields.
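
As a sketch, a v1 event vs. a v2 event might look like this (field values are invented for illustration; field names follow the changed code):

```python
# v1 payload: span/trace IDs sit at the top level of each metric.
v1_event = {
    "span_id": "12345678902",
    "trace_id": "98765432102",
    "metric_type": "score",
    "label": "sentiment",
    "score_value": 0.9,
}

# v2 payload: identifiers are nested under `join_on`, which may instead
# carry a {"tag": {"key": ..., "value": ...}} entry for tag-based joins.
v2_event = {
    "join_on": {"span": {"span_id": "12345678902", "trace_id": "98765432102"}},
    "metric_type": "score",
    "label": "sentiment",
    "score_value": 0.9,
}
```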

#### Deprecate `submit_evaluation`
Deprecates `submit_evaluation`. **The removal version is set to `3.0.0`.**


## Checklist
- [x] PR author has checked that all the criteria below are met
- The PR description includes an overview of the change
- The PR description articulates the motivation for the change
- The change includes tests OR the PR description describes a testing
strategy
- The PR description notes risks associated with the change, if any
- Newly-added code is easy to change
- The change follows the [library release note
guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
- The change includes or references documentation updates if necessary
- Backport labels are set (if
[applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met 
- Title is accurate
- All changes are related to the pull request's stated goal
- Avoids breaking
[API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces)
changes
- Testing strategy adequately addresses listed risks
- Newly-added code is easy to change
- Release note makes sense to a user of the library
- If necessary, author has acknowledged and discussed the performance
implications of this PR as reported in the benchmarks PR comment
- Backport labels are set in a manner that is consistent with the
[release branch maintenance
policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---------

Co-authored-by: lievan <[email protected]>
lievan and lievan authored Jan 10, 2025
1 parent 5e68823 commit 1b223aa
Showing 15 changed files with 648 additions and 162 deletions.
135 changes: 132 additions & 3 deletions ddtrace/llmobs/_llmobs.py
@@ -28,6 +28,7 @@
from ddtrace.internal.service import ServiceStatusError
from ddtrace.internal.telemetry import telemetry_writer
from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT
from ddtrace.internal.utils.deprecations import DDTraceDeprecationWarning
from ddtrace.internal.utils.formats import asbool
from ddtrace.internal.utils.formats import parse_tags_str
from ddtrace.llmobs import _constants as constants
@@ -66,6 +67,7 @@
from ddtrace.llmobs.utils import ExportedLLMObsSpan
from ddtrace.llmobs.utils import Messages
from ddtrace.propagation.http import HTTPPropagator
from ddtrace.vendor.debtcollector import deprecate


log = get_logger(__name__)
@@ -904,6 +906,127 @@ def _tag_metrics(span: Span, metrics: Dict[str, Any]) -> None:
return
span._set_ctx_item(METRICS, metrics)

@classmethod
def submit_evaluation_for(
cls,
label: str,
metric_type: str,
value: Union[str, int, float],
span: Optional[dict] = None,
span_with_tag_value: Optional[Dict[str, str]] = None,
tags: Optional[Dict[str, str]] = None,
ml_app: Optional[str] = None,
timestamp_ms: Optional[int] = None,
) -> None:
"""
Submits a custom evaluation metric for a given span.
:param str label: The name of the evaluation metric.
:param str metric_type: The type of the evaluation metric. One of "categorical", "score".
:param value: The value of the evaluation metric.
Must be a string (categorical), integer (score), or float (score).
:param dict span: A dictionary of shape {'span_id': str, 'trace_id': str} uniquely identifying
the span associated with this evaluation.
:param dict span_with_tag_value: A dictionary with the format {'tag_key': str, 'tag_value': str}
uniquely identifying the span associated with this evaluation.
:param tags: A dictionary of string key-value pairs to tag the evaluation metric with.
:param str ml_app: The name of the ML application.
:param int timestamp_ms: The unix timestamp in milliseconds when the evaluation metric result was generated.
If not set, the current time will be used.
"""
if cls.enabled is False:
log.debug(
"LLMObs.submit_evaluation_for() called when LLMObs is not enabled. "
"Evaluation metric data will not be sent."
)
return

has_exactly_one_joining_key = (span is not None) ^ (span_with_tag_value is not None)

if not has_exactly_one_joining_key:
raise ValueError(
"Exactly one of `span` or `span_with_tag_value` must be specified to submit an evaluation metric."
)

join_on = {}
if span is not None:
if (
not isinstance(span, dict)
or not isinstance(span.get("span_id"), str)
or not isinstance(span.get("trace_id"), str)
):
raise TypeError(
"`span` must be a dictionary containing both span_id and trace_id keys. "
"LLMObs.export_span() can be used to generate this dictionary from a given span."
)
join_on["span"] = span
elif span_with_tag_value is not None:
if (
not isinstance(span_with_tag_value, dict)
or not isinstance(span_with_tag_value.get("tag_key"), str)
or not isinstance(span_with_tag_value.get("tag_value"), str)
):
raise TypeError(
"`span_with_tag_value` must be a dict with keys 'tag_key' and 'tag_value' containing string values"
)
join_on["tag"] = {
"key": span_with_tag_value.get("tag_key"),
"value": span_with_tag_value.get("tag_value"),
}

timestamp_ms = timestamp_ms if timestamp_ms else int(time.time() * 1000)

if not isinstance(timestamp_ms, int) or timestamp_ms < 0:
raise ValueError("timestamp_ms must be a non-negative integer. Evaluation metric data will not be sent")

if not label:
raise ValueError("label must be the specified name of the evaluation metric.")

metric_type = metric_type.lower()
if metric_type not in ("categorical", "score"):
raise ValueError("metric_type must be one of 'categorical' or 'score'.")

if metric_type == "categorical" and not isinstance(value, str):
raise TypeError("value must be a string for a categorical metric.")
if metric_type == "score" and not isinstance(value, (int, float)):
raise TypeError("value must be an integer or float for a score metric.")

if tags is not None and not isinstance(tags, dict):
log.warning("tags must be a dictionary of string key-value pairs.")
tags = {}

evaluation_tags = {
"ddtrace.version": ddtrace.__version__,
"ml_app": ml_app,
}

if tags:
for k, v in tags.items():
try:
evaluation_tags[ensure_text(k)] = ensure_text(v)
except TypeError:
log.warning("Failed to parse tags. Tags for evaluation metrics must be strings.")

ml_app = ml_app if ml_app else config._llmobs_ml_app
if not ml_app:
log.warning(
"ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. "
"Ensure this configuration is set before running your application."
)
return

evaluation_metric = {
"join_on": join_on,
"label": str(label),
"metric_type": metric_type,
"timestamp_ms": timestamp_ms,
"{}_value".format(metric_type): value,
"ml_app": ml_app,
"tags": ["{}:{}".format(k, v) for k, v in evaluation_tags.items()],
}

cls._instance._llmobs_eval_metric_writer.enqueue(evaluation_metric)

@classmethod
def submit_evaluation(
cls,
@@ -916,6 +1039,13 @@ def submit_evaluation(
timestamp_ms: Optional[int] = None,
metadata: Optional[Dict[str, object]] = None,
) -> None:
deprecate(
"Using `LLMObs.submit_evaluation` is deprecated",
message="Please use `LLMObs.submit_evaluation_for` instead.",
removal_version="3.0.0",
category=DDTraceDeprecationWarning,
)

"""
Submits a custom evaluation metric for a given span ID and trace ID.
@@ -931,7 +1061,7 @@
evaluation metric.
"""
if cls.enabled is False:
log.warning(
log.debug(
"LLMObs.submit_evaluation() called when LLMObs is not enabled. Evaluation metric data will not be sent."
)
return
@@ -1007,8 +1137,7 @@ def submit_evaluation(
log.warning("Failed to parse tags. Tags for evaluation metrics must be strings.")

evaluation_metric = {
"span_id": span_id,
"trace_id": trace_id,
"join_on": {"span": {"span_id": span_id, "trace_id": trace_id}},
"label": str(label),
"metric_type": metric_type.lower(),
"timestamp_ms": timestamp_ms,
12 changes: 9 additions & 3 deletions ddtrace/llmobs/_writer.py
@@ -55,8 +55,7 @@ class LLMObsSpanEvent(TypedDict):


class LLMObsEvaluationMetricEvent(TypedDict, total=False):
span_id: str
trace_id: str
join_on: Dict[str, Dict[str, str]]
metric_type: str
label: str
categorical_value: str
@@ -107,6 +106,13 @@ def periodic(self) -> None:
events = self._buffer
self._buffer = []

if not self._headers.get("DD-API-KEY"):
logger.warning(
"DD_API_KEY is required for sending evaluation metrics. Evaluation metric data will not be sent. "
"Ensure this configuration is set before running your application."
)
return

data = self._data(events)
enc_llm_events = safe_json(data)
conn = httplib.HTTPSConnection(self._intake, 443, timeout=self._timeout)
@@ -154,7 +160,7 @@ def __init__(self, site: str, api_key: str, interval: float, timeout: float) ->
super(LLMObsEvalMetricWriter, self).__init__(site, api_key, interval, timeout)
self._event_type = "evaluation_metric"
self._buffer = []
self._endpoint = "/api/intake/llm-obs/v1/eval-metric"
self._endpoint = "/api/intake/llm-obs/v2/eval-metric"
self._intake = "api.%s" % self._site # type: str

def enqueue(self, event: LLMObsEvaluationMetricEvent) -> None:
17 changes: 17 additions & 0 deletions releasenotes/notes/submit-evaluation-for-01096d803d969e3e.yaml
@@ -0,0 +1,17 @@
---
features:
- |
LLM Observability: This introduces the `LLMObs.submit_evaluation_for` method, which provides the ability to join a custom evaluation
to a span using a tag key-value pair on the span. The tag key-value pair is expected to uniquely identify a single span.
Tag-based joining is an alternative to the existing method of joining evaluations to spans using trace and span IDs.
Example usage:
- Evaluation joined by tag: `LLMObs.submit_evaluation_for(span_with_tag_value={"tag_key": "message_id", "tag_value": "dummy_message_id"}, label="rating", ...)`.
- Evaluation joined by trace/span ID: `LLMObs.submit_evaluation_for(span={"trace_id": "...", "span_id": "..."}, label="rating", ...)`.
deprecations:
- |
LLM Observability: `LLMObs.submit_evaluation` is deprecated and will be removed in ddtrace 3.0.0.
As an alternative to `LLMObs.submit_evaluation`, you can use `LLMObs.submit_evaluation_for` instead.
To migrate, replace `LLMObs.submit_evaluation(span_context={"span_id": ..., "trace_id": ...}, ...)` with:
`LLMObs.submit_evaluation_for(span={"span_id": ..., "trace_id": ...}, ...)`.
You may also join an evaluation to a span using a tag key-value pair like so:
`LLMObs.submit_evaluation_for(span_with_tag_value={"tag_key": ..., "tag_value": ...}, ...)`.
16 changes: 10 additions & 6 deletions tests/llmobs/_utils.py
@@ -210,11 +210,13 @@ def _get_llmobs_parent_id(span: Span):


def _expected_llmobs_eval_metric_event(
span_id,
trace_id,
metric_type,
label,
ml_app,
tag_key=None,
tag_value=None,
span_id=None,
trace_id=None,
timestamp_ms=None,
categorical_value=None,
score_value=None,
@@ -223,15 +225,18 @@ def _expected_llmobs_eval_metric_event(
metadata=None,
):
eval_metric_event = {
"span_id": span_id,
"trace_id": trace_id,
"join_on": {},
"metric_type": metric_type,
"label": label,
"tags": [
"ddtrace.version:{}".format(ddtrace.__version__),
"ml_app:{}".format(ml_app if ml_app is not None else "unnamed-ml-app"),
],
}
if tag_key is not None and tag_value is not None:
eval_metric_event["join_on"]["tag"] = {"key": tag_key, "value": tag_value}
if span_id is not None and trace_id is not None:
eval_metric_event["join_on"]["span"] = {"span_id": span_id, "trace_id": trace_id}
if categorical_value is not None:
eval_metric_event["categorical_value"] = categorical_value
if score_value is not None:
@@ -542,8 +547,7 @@ def run_and_submit_evaluation(self, span):

def _dummy_evaluator_eval_metric_event(span_id, trace_id):
return LLMObsEvaluationMetricEvent(
span_id=span_id,
trace_id=trace_id,
join_on={"span": {"span_id": span_id, "trace_id": trace_id}},
score_value=1.0,
ml_app="unnamed-ml-app",
timestamp_ms=mock.ANY,
@@ -1,27 +1,28 @@
interactions:
- request:
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id":
"12345678902", "trace_id": "98765432102", "metric_type": "score", "label": "sentiment",
"score_value": 0.9, "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500942}]}}}'
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
{"span": {"span_id": "12345678902", "trace_id": "98765432102"}}, "metric_type":
"score", "label": "sentiment", "score_value": 0.9, "ml_app": "dummy-ml-app",
"timestamp_ms": 1732568298743}]}}}'
headers:
Content-Type:
- application/json
DD-API-KEY:
- XXXXXX
method: POST
uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric
uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric
response:
body:
string: '{"data":{"id":"e66c93b9-ca0a-4f0a-9207-497e0a1b6eec","type":"evaluation_metric","attributes":{"metrics":[{"id":"5fb5ed5d-20c1-4f34-abf9-c0bdc09680e3","trace_id":"98765432102","span_id":"12345678902","timestamp_ms":1724249500942,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}'
string: '{"data":{"id":"5b998846-53af-4b0e-a658-fd9e06726d6d","type":"evaluation_metric","attributes":{"metrics":[{"id":"jbGbAMC7Rk","join_on":{"span":{"trace_id":"98765432102","span_id":"12345678902"}},"timestamp_ms":1732568298743,"ml_app":"dummy-ml-app","metric_type":"score","label":"sentiment","score_value":0.9}]}}}'
headers:
content-length:
- '316'
- '311'
content-security-policy:
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com
content-type:
- application/vnd.api+json
date:
- Wed, 21 Aug 2024 14:11:41 GMT
- Mon, 25 Nov 2024 20:58:19 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
vary:
@@ -1,27 +1,28 @@
interactions:
- request:
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id":
"12345678901", "trace_id": "98765432101", "metric_type": "categorical", "categorical_value":
"very", "label": "toxicity", "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500339}]}}}'
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
{"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type":
"categorical", "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app",
"timestamp_ms": 1732568297450}]}}}'
headers:
Content-Type:
- application/json
DD-API-KEY:
- XXXXXX
method: POST
uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric
uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric
response:
body:
string: '{"data":{"id":"36d88c24-d7d4-4d3e-853c-b695aff61344","type":"evaluation_metric","attributes":{"metrics":[{"id":"0c189d9c-a730-4c5d-bbc2-55ef3455900f","trace_id":"98765432101","span_id":"12345678901","timestamp_ms":1724249500339,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}'
string: '{"data":{"id":"49c5c927-76f1-4de4-ad97-e1a0a159229f","type":"evaluation_metric","attributes":{"metrics":[{"id":"okVf1U4XzA","join_on":{"span":{"trace_id":"98765432101","span_id":"12345678901"}},"timestamp_ms":1732568297450,"ml_app":"dummy-ml-app","metric_type":"categorical","label":"toxicity","categorical_value":"very"}]}}}'
headers:
content-length:
- '330'
- '325'
content-security-policy:
- frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pub293163a918901030b79492fe1ab424cf&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatad0g.com
content-type:
- application/vnd.api+json
date:
- Wed, 21 Aug 2024 14:11:40 GMT
- Mon, 25 Nov 2024 20:58:17 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
vary:
@@ -1,15 +1,16 @@
interactions:
- request:
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"span_id":
"12345678901", "trace_id": "98765432101", "metric_type": "categorical", "categorical_value":
"very", "label": "toxicity", "ml_app": "dummy-ml-app", "timestamp_ms": 1724249500253}]}}}'
body: '{"data": {"type": "evaluation_metric", "attributes": {"metrics": [{"join_on":
{"span": {"span_id": "12345678901", "trace_id": "98765432101"}}, "metric_type":
"categorical", "categorical_value": "very", "label": "toxicity", "ml_app": "dummy-ml-app",
"timestamp_ms": 1732568297307}]}}}'
headers:
Content-Type:
- application/json
DD-API-KEY:
- XXXXXX
method: POST
uri: https://api.datad0g.com/api/intake/llm-obs/v1/eval-metric
uri: https://api.datad0g.com/api/intake/llm-obs/v2/eval-metric
response:
body:
string: '{"status":"error","code":403,"errors":["Forbidden"],"statuspage":"http://status.datadoghq.com","twitter":"http://twitter.com/datadogops","email":"[email protected]"}'
@@ -21,7 +22,7 @@ interactions:
content-type:
- application/json
date:
- Wed, 21 Aug 2024 14:11:40 GMT
- Mon, 25 Nov 2024 20:58:17 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
x-content-type-options: