chore(llmobs): implement answer relevancy ragas metric #11738

Closed
wants to merge 21 commits into from
151 changes: 151 additions & 0 deletions ddtrace/llmobs/_evaluators/ragas/answer_relevancy.py
@@ -0,0 +1,151 @@
import math
from typing import Optional
from typing import Tuple
from typing import Union

from ddtrace.internal.logger import get_logger
from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA
from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator
from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace


logger = get_logger(__name__)


class RagasAnswerRelevancyEvaluator(BaseRagasEvaluator):
"""A class used by EvaluatorRunner to conduct ragas answer relevancy evaluations
on LLM Observability span events. The job of an Evaluator is to take a span and
submit evaluation metrics based on the span's attributes.
"""

LABEL = "ragas_answer_relevancy"
METRIC_TYPE = "score"

def __init__(self, llmobs_service):
"""
Initialize an evaluator that uses the ragas library to generate an answer relevancy score on finished LLM spans.

Answer relevancy focuses on assessing how pertinent the generated answer is to a given question.
A lower score is assigned to answers that are incomplete or contain redundant information, and higher scores
indicate better relevancy. This metric is computed using the question, contexts, and answer.

For more information, see https://docs.ragas.io/en/latest/concepts/metrics/available_metrics/answer_relevance/

The `ragas.metrics.answer_relevancy` instance is used for answer relevancy scores.
If there is no llm attribute set on this instance, it will be set to the
default `llm_factory()` from ragas, which uses openai.
If there is no embeddings attribute set on this instance, it will be set to the
default `embedding_factory()` from ragas, which also uses openai.

:param llmobs_service: An instance of the LLM Observability service used for tracing the evaluation and
submitting evaluation metrics.

Raises: NotImplementedError if the ragas library is not found or if ragas version is not supported.
"""
super().__init__(llmobs_service)
self.ragas_answer_relevancy_instance = self._get_answer_relevancy_instance()
Contributor:
Dumb question - what is the purpose of having a different LLM instance per eval metric runner? Are they not all references to the same base OpenAI() LLM?

Contributor Author:
Each eval metric runner has access to one instance of a ragas metric (answer_relevancy, context_precision, faithfulness), and each of these ragas metrics has a separate llm attribute. We maintain a reference to the ragas metric, not the llm attribute.

But yes, the default is openai for all of them.

self.answer_relevancy_output_parser = self.mini_ragas.RagasoutputParser(
pydantic_object=self.mini_ragas.AnswerRelevanceClassification
)

def _get_answer_relevancy_instance(self):
"""
This helper function ensures the answer relevancy instance used by the
ragas evaluator is updated with the latest ragas answer relevancy instance
AND has a non-null llm.
"""
if self.mini_ragas.answer_relevancy is None:
return None
ragas_answer_relevancy_instance = self.mini_ragas.answer_relevancy
if not ragas_answer_relevancy_instance.llm:
ragas_answer_relevancy_instance.llm = self.mini_ragas.llm_factory()
if not ragas_answer_relevancy_instance.embeddings:
ragas_answer_relevancy_instance.embeddings = self.mini_ragas.embedding_factory()
return ragas_answer_relevancy_instance

def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]:
"""
Performs an answer relevancy evaluation on an llm span event, returning either
- answer relevancy score (float) OR failure reason (str)
- evaluation metadata (dict)
If the ragas answer relevancy instance does not have `llm` set, we set `llm` using the `llm_factory()`
method from ragas which currently defaults to openai's gpt-4o-turbo.
"""
self.ragas_answer_relevancy_instance = self._get_answer_relevancy_instance()
if not self.ragas_answer_relevancy_instance:
return "fail_answer_relevancy_is_none", {}

evaluation_metadata = {} # type: dict[str, Union[str, dict, list]]
trace_metadata = {} # type: dict[str, Union[str, dict, list]]

# initialize data we annotate for tracing ragas
score, question, answer, answer_classifications = (
math.nan,
None,
None,
None,
)

with self.llmobs_service.workflow(
"dd-ragas.answer_relevancy", ml_app=_get_ml_app_for_ragas_trace(span_event)
) as ragas_ar_workflow:
try:
evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span(span=ragas_ar_workflow)

cp_inputs = self._extract_evaluation_inputs_from_span(span_event)
if cp_inputs is None:
logger.debug(
"Failed to extract question and contexts from "
"span sampled for `ragas_answer_relevancy` evaluation"
)
return "fail_extract_answer_relevancy_inputs", evaluation_metadata

question = cp_inputs["question"]
contexts = cp_inputs["contexts"]
answer = cp_inputs["answer"]
Contributor (on lines +103 to +105):
Same comment as before: it doesn't seem necessary to explicitly separate these into new variables, since they're only used once immediately afterward.


prompt = self.ragas_answer_relevancy_instance.question_generation.format(
answer=answer,
context="\n".join(contexts),
)

# 'strictness' is a parameter that can be set to control the number of generations
trace_metadata["strictness"] = self.ragas_answer_relevancy_instance.strictness
result = self.ragas_answer_relevancy_instance.llm.generate_text(
prompt, n=self.ragas_answer_relevancy_instance.strictness
)

try:
answers = [self.answer_relevancy_output_parser.parse(res.text) for res in result.generations[0]]
answers = [answer for answer in answers if answer is not None]
except Exception as e:
logger.debug("Failed to parse answer relevancy output: %s", e)
return "fail_parse_answer_relevancy_output", evaluation_metadata

gen_questions = [answer.question for answer in answers]
answer_classifications = [
{"question": answer.question, "noncommittal": answer.noncommittal} for answer in answers
]
trace_metadata["answer_classifications"] = answer_classifications
if all(q == "" for q in gen_questions):
logger.warning("Invalid JSON response. Expected dictionary with key 'question'")
return "fail_parse_answer_relevancy_output", evaluation_metadata

# calculate cosine similarity between the question and generated questions
with self.llmobs_service.workflow("dd-ragas.calculate_similarity") as ragas_cs_workflow:
cosine_sim = self.ragas_answer_relevancy_instance.calculate_similarity(question, gen_questions)
self.llmobs_service.annotate(
span=ragas_cs_workflow,
input_data={"question": question, "generated_questions": gen_questions},
output_data=cosine_sim.mean(),
)

score = cosine_sim.mean() * int(not any(answer.noncommittal for answer in answers))
return score, evaluation_metadata
finally:
self.llmobs_service.annotate(
span=ragas_ar_workflow,
input_data=span_event,
output_data=score,
metadata=trace_metadata,
)
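
Not part of the diff: a minimal, self-contained sketch of the scoring arithmetic the evaluator performs above, i.e. the mean cosine similarity between the original question and the questions regenerated from the answer, zeroed out when any regeneration is classified as noncommittal. The random vectors below are hypothetical stand-ins for the ragas embeddings.

import numpy as np


def cosine_similarities(question_vec: np.ndarray, generated_vecs: np.ndarray) -> np.ndarray:
    # Cosine similarity between one question vector and each generated-question vector.
    q = question_vec / np.linalg.norm(question_vec)
    g = generated_vecs / np.linalg.norm(generated_vecs, axis=1, keepdims=True)
    return g @ q


def answer_relevancy_score(question_vec, generated_vecs, noncommittal_flags):
    # Mean similarity, multiplied by 0 if any regenerated question was noncommittal.
    sims = cosine_similarities(question_vec, generated_vecs)
    return float(sims.mean()) * int(not any(noncommittal_flags))


rng = np.random.default_rng(0)  # random vectors standing in for real embeddings
print(answer_relevancy_score(rng.normal(size=8), rng.normal(size=(3, 8)), [0, 0, 0]))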
159 changes: 159 additions & 0 deletions ddtrace/llmobs/_evaluators/ragas/context_precision.py
@@ -0,0 +1,159 @@
import math
from typing import Optional
from typing import Tuple
from typing import Union

from ddtrace.internal.logger import get_logger
from ddtrace.llmobs._constants import EVALUATION_KIND_METADATA
from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA
from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator
from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace


logger = get_logger(__name__)


class RagasContextPrecisionEvaluator(BaseRagasEvaluator):
"""A class used by EvaluatorRunner to conduct ragas context precision evaluations
on LLM Observability span events. The job of an Evaluator is to take a span and
submit evaluation metrics based on the span's attributes.
"""

LABEL = "ragas_context_precision"
METRIC_TYPE = "score"

def __init__(self, llmobs_service):
"""
Initialize an evaluator that uses the ragas library to generate a context precision score on finished LLM spans.

Context Precision is a metric that verifies if the context was useful in arriving at the given answer.
We compute this by dividing the number of relevant contexts by the total number of contexts.
Note that this is slightly modified from the original context precision metric in ragas, which computes
the mean of the precision @ rank k for each chunk in the context (where k is the number of
retrieved context chunks).

For more information, see https://docs.ragas.io/en/latest/concepts/metrics/available_metrics/context_precision/

The `ragas.metrics.context_precision` instance is used for context precision scores.
If there is no llm attribute set on this instance, it will be set to the
default `llm_factory()` which uses openai.

:param llmobs_service: An instance of the LLM Observability service used for tracing the evaluation and
submitting evaluation metrics.

Raises: NotImplementedError if the ragas library is not found or if ragas version is not supported.
"""
super().__init__(llmobs_service)
self.ragas_context_precision_instance = self._get_context_precision_instance()
self.context_precision_output_parser = self.mini_ragas.RagasoutputParser(
pydantic_object=self.mini_ragas.ContextPrecisionVerification
)

def _get_context_precision_instance(self):
"""
This helper function ensures the context precision instance used by the
ragas evaluator is updated with the latest ragas context precision instance
AND has a non-null llm.
"""
if self.mini_ragas.context_precision is None:
return None
ragas_context_precision_instance = self.mini_ragas.context_precision
if not ragas_context_precision_instance.llm:
ragas_context_precision_instance.llm = self.mini_ragas.llm_factory()
return ragas_context_precision_instance

def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]:
"""
Performs a context precision evaluation on an llm span event, returning either
- context precision score (float) OR failure reason (str)
- evaluation metadata (dict)
If the ragas context precision instance does not have `llm` set, we set `llm` using the `llm_factory()`
method from ragas which currently defaults to openai's gpt-4o-turbo.
"""
self.ragas_context_precision_instance = self._get_context_precision_instance()
if not self.ragas_context_precision_instance:
return "fail_context_precision_is_none", {}

evaluation_metadata = {EVALUATION_KIND_METADATA: "context_precision"} # type: dict[str, Union[str, dict, list]]

# initialize data we annotate for tracing ragas
score, question, answer = (
math.nan,
None,
None,
)

with self.llmobs_service.workflow(
"dd-ragas.context_precision", ml_app=_get_ml_app_for_ragas_trace(span_event)
) as ragas_cp_workflow:
try:
evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span(span=ragas_cp_workflow)

cp_inputs = self._extract_evaluation_inputs_from_span(span_event)
if cp_inputs is None:
logger.debug(
"Failed to extract evaluation inputs from "
"span sampled for `ragas_context_precision` evaluation"
)
return "fail_extract_context_precision_inputs", evaluation_metadata

question = cp_inputs["question"]
contexts = cp_inputs["contexts"]
answer = cp_inputs["answer"]

# create a prompt to evaluate the relevancy of each context chunk
ctx_precision_prompts = [
self.ragas_context_precision_instance.context_precision_prompt.format(
question=question, context=c, answer=answer
)
for c in contexts
]

responses = []

for prompt in ctx_precision_prompts:
result = self.ragas_context_precision_instance.llm.generate_text(prompt)
reproducibility = getattr(self.ragas_context_precision_instance, "_reproducibility", 1)

results = [result.generations[0][i].text for i in range(reproducibility)]
try:
responses.append(
[
res.dict()
for res in [self.context_precision_output_parser.parse(text) for text in results]
if res is not None
]
)
except Exception as e:
logger.debug(
"Failed to parse context precision verification for `ragas_context_precision`",
exc_info=e,
)
return "fail_context_precision_parsing", evaluation_metadata

answers = []
for response in responses:
agg_answer = self.mini_ragas.ensembler.from_discrete([response], "verdict")
if agg_answer:
try:
agg_answer = self.mini_ragas.ContextPrecisionVerification.parse_obj(agg_answer[0])
except Exception as e:
logger.debug(
"Failed to parse context precision verification for `ragas_context_precision`",
exc_info=e,
)
return "fail_context_precision_parsing", evaluation_metadata
answers.append(agg_answer)

if len(answers) == 0:
return "fail_no_answers", evaluation_metadata

verdict_list = [1 if ver.verdict else 0 for ver in answers]
score = sum(verdict_list) / len(verdict_list)
return score, evaluation_metadata
finally:
self.llmobs_service.annotate(
span=ragas_cp_workflow,
input_data=span_event,
output_data=score,
)
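
Not part of the diff: a small sketch contrasting the simplified score used here (relevant chunks divided by total chunks) with the precision-at-k formulation that the docstring says the original ragas metric uses. The hard-coded verdict list is a hypothetical example of the per-chunk 0/1 LLM verdicts aggregated above.

from typing import List


def simplified_context_precision(verdicts: List[int]) -> float:
    # This PR's score: number of relevant chunks divided by total chunks.
    return sum(1 if v else 0 for v in verdicts) / len(verdicts)


def mean_precision_at_k(verdicts: List[int]) -> float:
    # Precision-at-k formulation: average precision@k over the relevant chunks.
    total = 0.0
    hits = 0
    for k, verdict in enumerate(verdicts, start=1):
        if verdict:
            hits += 1
            total += hits / k
    return total / max(hits, 1)


verdicts = [1, 0, 1, 1]
print(simplified_context_precision(verdicts))  # 0.75
print(mean_precision_at_k(verdicts))           # (1/1 + 2/3 + 3/4) / 3 ≈ 0.81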
12 changes: 12 additions & 0 deletions ddtrace/llmobs/_evaluators/ragas/models.py
@@ -11,6 +11,18 @@
"""


class AnswerRelevanceClassification(BaseModel):
question: str
noncommittal: int


class ContextPrecisionVerification(BaseModel):
"""Answer for the verification task whether the context was useful."""

reason: str = Field(..., description="Reason for verification")
verdict: int = Field(..., description="Binary (0/1) verdict of verification")


class StatementFaithfulnessAnswer(BaseModel):
statement: str = Field(..., description="the original statement, word-by-word")
reason: str = Field(..., description="the reason of the verdict")
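
Not part of the diff: an illustrative example of how a raw LLM JSON response could be validated against the AnswerRelevanceClassification model added above, using the pydantic v1-style API that the rest of this PR relies on (parse_obj / parse_raw).

from pydantic import BaseModel


class AnswerRelevanceClassification(BaseModel):
    question: str
    noncommittal: int


raw = '{"question": "What is the capital of France?", "noncommittal": 0}'
parsed = AnswerRelevanceClassification.parse_raw(raw)  # pydantic v1-style parsing
print(parsed.question, parsed.noncommittal)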
6 changes: 5 additions & 1 deletion ddtrace/llmobs/_evaluators/runner.py
@@ -8,6 +8,8 @@
from ddtrace.internal.periodic import PeriodicService
from ddtrace.internal.telemetry import telemetry_writer
from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT
from ddtrace.llmobs._evaluators.ragas.answer_relevancy import RagasAnswerRelevancyEvaluator
from ddtrace.llmobs._evaluators.ragas.context_precision import RagasContextPrecisionEvaluator
from ddtrace.llmobs._evaluators.ragas.faithfulness import RagasFaithfulnessEvaluator
from ddtrace.llmobs._evaluators.sampler import EvaluatorRunnerSampler

@@ -16,7 +18,9 @@


SUPPORTED_EVALUATORS = {
RagasAnswerRelevancyEvaluator.LABEL: RagasAnswerRelevancyEvaluator,
RagasFaithfulnessEvaluator.LABEL: RagasFaithfulnessEvaluator,
RagasContextPrecisionEvaluator.LABEL: RagasContextPrecisionEvaluator,
}


@@ -50,7 +54,7 @@ def __init__(self, interval: float, llmobs_service=None, evaluators=None):
if evaluator in SUPPORTED_EVALUATORS:
evaluator_init_state = "ok"
try:
self.evaluators.append(SUPPORTED_EVALUATORS[evaluator](llmobs_service=llmobs_service))
self.evaluators.append(SUPPORTED_EVALUATORS[evaluator](llmobs_service=llmobs_service)) # noqa: E501
Contributor:
Why is this fmt comment required?

Contributor Author:
There was an error being raised by mypy due to the use of ABC. I think it was a bug, since it only appeared when we had 3 or more evaluators (see: python/mypy#13044). But we are not using an abstract class for BaseRagasEvaluator anyway, so this is no longer needed.

except NotImplementedError as e:
evaluator_init_state = "error"
raise e
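
Not part of the diff: a stand-alone illustration of the label-to-class registry pattern that EvaluatorRunner uses above to instantiate the newly registered evaluators. FakeEvaluator and build_evaluators are hypothetical names used only for this sketch.

class FakeEvaluator:
    LABEL = "ragas_answer_relevancy"

    def __init__(self, llmobs_service):
        self.llmobs_service = llmobs_service


SUPPORTED_EVALUATORS = {FakeEvaluator.LABEL: FakeEvaluator}


def build_evaluators(requested_labels, llmobs_service):
    # Mirrors EvaluatorRunner.__init__: only labels present in the registry are
    # instantiated; the real runner also records an init state for telemetry.
    return [
        SUPPORTED_EVALUATORS[label](llmobs_service=llmobs_service)
        for label in requested_labels
        if label in SUPPORTED_EVALUATORS
    ]


evaluators = build_evaluators(["ragas_answer_relevancy", "unknown_label"], llmobs_service=None)
print([type(e).__name__ for e in evaluators])  # ['FakeEvaluator']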