chore(llmobs): implement answer relevancy ragas metric #11738

Closed
wants to merge 21 commits into from
151 changes: 151 additions & 0 deletions ddtrace/llmobs/_evaluators/ragas/answer_relevancy.py
@@ -0,0 +1,151 @@
import math
from typing import Optional
from typing import Tuple
from typing import Union

from ddtrace.internal.logger import get_logger
from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA
from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator
from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace


logger = get_logger(__name__)


class RagasAnswerRelevancyEvaluator(BaseRagasEvaluator):
"""A class used by EvaluatorRunner to conduct ragas answer relevancy evaluations
on LLM Observability span events. The job of an Evaluator is to take a span and
submit evaluation metrics based on the span's attributes.
"""

LABEL = "ragas_answer_relevancy"
METRIC_TYPE = "score"

def __init__(self, llmobs_service):
"""
Initialize an evaluator that uses the ragas library to generate an answer relevancy score on finished LLM spans.

Answer relevancy focuses on assessing how pertinent the generated answer is to a given question.
A lower score is assigned to answers that are incomplete or contain redundant information, and higher scores
indicate better relevancy. This metric is computed using the question, contexts, and answer.

For more information, see https://docs.ragas.io/en/latest/concepts/metrics/available_metrics/answer_relevance/

The `ragas.metrics.answer_relevancy` instance is used for answer relevancy scores.
If there is no llm attribute set on this instance, it will be set to the
default `llm_factory()` from ragas, which uses openai.
If there is no embeddings attribute set on this instance, it will be set to the
default `embedding_factory()` from ragas, which also uses openai.

:param llmobs_service: An instance of the LLM Observability service used for tracing the evaluation and
submitting evaluation metrics.

Raises: NotImplementedError if the ragas library is not found or if ragas version is not supported.
"""
super().__init__(llmobs_service)
self.ragas_answer_relevancy_instance = self._get_answer_relevancy_instance()
Contributor:
Dumb question - what is the purpose of having a different LLM instance per eval metric runner? Are they not all references to the same base OpenAI() LLM?

Contributor Author:
Each eval metric runner has access to one instance of a ragas metric (answer_relevancy, context_precision, faithfulness), and each of these ragas metrics has a separate llm attribute. We maintain a reference to the ragas metric, not the llm attribute.

But yes, the default is openai for all of them.

self.answer_relevancy_output_parser = self.mini_ragas.RagasoutputParser(
pydantic_object=self.mini_ragas.AnswerRelevanceClassification
)

def _get_answer_relevancy_instance(self):
"""
This helper function ensures the answer relevancy instance used by the
ragas evaluator is updated with the latest ragas answer relevancy instance
AND has a non-null llm.
"""
if self.mini_ragas.answer_relevancy is None:
return None
ragas_answer_relevancy_instance = self.mini_ragas.answer_relevancy
if not ragas_answer_relevancy_instance.llm:
ragas_answer_relevancy_instance.llm = self.mini_ragas.llm_factory()
if not ragas_answer_relevancy_instance.embeddings:
ragas_answer_relevancy_instance.embeddings = self.mini_ragas.embedding_factory()
return ragas_answer_relevancy_instance

def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]:
"""
Performs an answer relevancy evaluation on an llm span event, returning either
- answer relevancy score (float) OR failure reason (str)
- evaluation metadata (dict)
If the ragas answer relevancy instance does not have `llm` set, we set `llm` using the `llm_factory()`
method from ragas which currently defaults to openai's gpt-4o-turbo.
"""
self.ragas_answer_relevancy_instance = self._get_answer_relevancy_instance()
if not self.ragas_answer_relevancy_instance:
return "fail_answer_relevancy_is_none", {}

evaluation_metadata = {} # type: dict[str, Union[str, dict, list]]
trace_metadata = {} # type: dict[str, Union[str, dict, list]]

# initialize data we annotate for tracing ragas
score, question, answer, answer_classifications = (
math.nan,
None,
None,
None,
)

with self.llmobs_service.workflow(
"dd-ragas.answer_relevancy", ml_app=_get_ml_app_for_ragas_trace(span_event)
) as ragas_ar_workflow:
try:
evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span(span=ragas_ar_workflow)

cp_inputs = self._extract_evaluation_inputs_from_span(span_event)
if cp_inputs is None:
logger.debug(
"Failed to extract question and contexts from "
"span sampled for `ragas_answer_relevancy` evaluation"
)
return "fail_extract_answer_relevancy_inputs", evaluation_metadata

question = cp_inputs["question"]
contexts = cp_inputs["contexts"]
answer = cp_inputs["answer"]
Contributor (on lines +103 to +105):
Same comment as before: it doesn't seem necessary to explicitly separate these into new variables, since they're only used once immediately afterward.


prompt = self.ragas_answer_relevancy_instance.question_generation.format(
answer=answer,
context="\n".join(contexts),
)

# 'strictness' is a parameter that can be set to control the number of generations
trace_metadata["strictness"] = self.ragas_answer_relevancy_instance.strictness
result = self.ragas_answer_relevancy_instance.llm.generate_text(
prompt, n=self.ragas_answer_relevancy_instance.strictness
)

try:
answers = [self.answer_relevancy_output_parser.parse(res.text) for res in result.generations[0]]
answers = [answer for answer in answers if answer is not None]
except Exception as e:
logger.debug("Failed to parse answer relevancy output: %s", e)
return "fail_parse_answer_relevancy_output", evaluation_metadata

gen_questions = [answer.question for answer in answers]
answer_classifications = [
{"question": answer.question, "noncommittal": answer.noncommittal} for answer in answers
]
trace_metadata["answer_classifications"] = answer_classifications
if all(q == "" for q in gen_questions):
logger.warning("Invalid JSON response. Expected dictionary with key 'question'")
return "fail_parse_answer_relevancy_output", evaluation_metadata

# calculate cosine similarity between the question and generated questions
with self.llmobs_service.workflow("dd-ragas.calculate_similarity") as ragas_cs_workflow:
cosine_sim = self.ragas_answer_relevancy_instance.calculate_similarity(question, gen_questions)
self.llmobs_service.annotate(
span=ragas_cs_workflow,
input_data={"question": question, "generated_questions": gen_questions},
output_data=cosine_sim.mean(),
)

score = cosine_sim.mean() * int(not any(answer.noncommittal for answer in answers))
return score, evaluation_metadata
finally:
self.llmobs_service.annotate(
span=ragas_ar_workflow,
input_data=span_event,
output_data=score,
metadata=trace_metadata,
)
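
Not part of the diff: a minimal, self-contained sketch of the scoring arithmetic the evaluator performs above, i.e. the mean cosine similarity between the original question and the questions regenerated from the answer, zeroed out when any regeneration is classified as noncommittal. The random vectors below are hypothetical stand-ins for the ragas embeddings.

import numpy as np


def cosine_similarities(question_vec: np.ndarray, generated_vecs: np.ndarray) -> np.ndarray:
    # Cosine similarity between one question vector and each generated-question vector.
    q = question_vec / np.linalg.norm(question_vec)
    g = generated_vecs / np.linalg.norm(generated_vecs, axis=1, keepdims=True)
    return g @ q


def answer_relevancy_score(question_vec, generated_vecs, noncommittal_flags):
    # Mean similarity, multiplied by 0 if any regenerated question was noncommittal.
    sims = cosine_similarities(question_vec, generated_vecs)
    return float(sims.mean()) * int(not any(noncommittal_flags))


rng = np.random.default_rng(0)  # random vectors standing in for real embeddings
print(answer_relevancy_score(rng.normal(size=8), rng.normal(size=(3, 8)), [0, 0, 0]))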
159 changes: 159 additions & 0 deletions ddtrace/llmobs/_evaluators/ragas/context_precision.py
@@ -0,0 +1,159 @@
import math
from typing import Optional
from typing import Tuple
from typing import Union

from ddtrace.internal.logger import get_logger
from ddtrace.llmobs._constants import EVALUATION_KIND_METADATA
from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA
from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator
from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace


logger = get_logger(__name__)


class RagasContextPrecisionEvaluator(BaseRagasEvaluator):
"""A class used by EvaluatorRunner to conduct ragas context precision evaluations
on LLM Observability span events. The job of an Evaluator is to take a span and
submit evaluation metrics based on the span's attributes.
"""

LABEL = "ragas_context_precision"
METRIC_TYPE = "score"

def __init__(self, llmobs_service):
"""
Initialize an evaluator that uses the ragas library to generate a context precision score on finished LLM spans.

Context Precision is a metric that verifies if the context was useful in arriving at the given answer.
We compute this by dividing the number of relevant contexts by the total number of contexts.
Note that this is slightly modified from the original context precision metric in ragas, which computes
the mean of the precision @ rank k for each chunk in the context (where k is the number of
retrieved context chunks).

For more information, see https://docs.ragas.io/en/latest/concepts/metrics/available_metrics/context_precision/

The `ragas.metrics.context_precision` instance is used for context precision scores.
If there is no llm attribute set on this instance, it will be set to the
default `llm_factory()` which uses openai.

:param llmobs_service: An instance of the LLM Observability service used for tracing the evaluation and
submitting evaluation metrics.

Raises: NotImplementedError if the ragas library is not found or if ragas version is not supported.
"""
super().__init__(llmobs_service)
self.ragas_context_precision_instance = self._get_context_precision_instance()
self.context_precision_output_parser = self.mini_ragas.RagasoutputParser(
pydantic_object=self.mini_ragas.ContextPrecisionVerification
)

def _get_context_precision_instance(self):
"""
This helper function ensures the context precision instance used by the
ragas evaluator is updated with the latest ragas context precision instance
AND has a non-null llm.
"""
if self.mini_ragas.context_precision is None:
return None
ragas_context_precision_instance = self.mini_ragas.context_precision
if not ragas_context_precision_instance.llm:
ragas_context_precision_instance.llm = self.mini_ragas.llm_factory()
return ragas_context_precision_instance

def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]:
"""
Performs a context precision evaluation on an llm span event, returning either
- context precision score (float) OR failure reason (str)
- evaluation metadata (dict)
If the ragas context precision instance does not have `llm` set, we set `llm` using the `llm_factory()`
method from ragas which currently defaults to openai's gpt-4o-turbo.
"""
self.ragas_context_precision_instance = self._get_context_precision_instance()
if not self.ragas_context_precision_instance:
return "fail_context_precision_is_none", {}

evaluation_metadata = {EVALUATION_KIND_METADATA: "context_precision"} # type: dict[str, Union[str, dict, list]]

# initialize data we annotate for tracing ragas
score, question, answer = (
math.nan,
None,
None,
)

with self.llmobs_service.workflow(
"dd-ragas.context_precision", ml_app=_get_ml_app_for_ragas_trace(span_event)
) as ragas_cp_workflow:
try:
evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span(span=ragas_cp_workflow)

cp_inputs = self._extract_evaluation_inputs_from_span(span_event)
if cp_inputs is None:
logger.debug(
"Failed to extract evaluation inputs from "
"span sampled for `ragas_context_precision` evaluation"
)
return "fail_extract_context_precision_inputs", evaluation_metadata

question = cp_inputs["question"]
contexts = cp_inputs["contexts"]
answer = cp_inputs["answer"]

# create a prompt to evaluate the relevancy of each context chunk
ctx_precision_prompts = [
self.ragas_context_precision_instance.context_precision_prompt.format(
question=question, context=c, answer=answer
)
for c in contexts
]

responses = []

for prompt in ctx_precision_prompts:
result = self.ragas_context_precision_instance.llm.generate_text(prompt)
reproducibility = getattr(self.ragas_context_precision_instance, "_reproducibility", 1)

results = [result.generations[0][i].text for i in range(reproducibility)]
try:
responses.append(
[
res.dict()
for res in [self.context_precision_output_parser.parse(text) for text in results]
if res is not None
]
)
except Exception as e:
logger.debug(
"Failed to parse context precision verification for `ragas_context_precision`",
exc_info=e,
)
return "fail_context_precision_parsing", evaluation_metadata

answers = []
for response in responses:
agg_answer = self.mini_ragas.ensembler.from_discrete([response], "verdict")
if agg_answer:
try:
agg_answer = self.mini_ragas.ContextPrecisionVerification.parse_obj(agg_answer[0])
except Exception as e:
logger.debug(
"Failed to parse context precision verification for `ragas_context_precision`",
exc_info=e,
)
return "fail_context_precision_parsing", evaluation_metadata
answers.append(agg_answer)

if len(answers) == 0:
return "fail_no_answers", evaluation_metadata

verdict_list = [1 if ver.verdict else 0 for ver in answers]
score = sum(verdict_list) / len(verdict_list)
return score, evaluation_metadata
finally:
self.llmobs_service.annotate(
span=ragas_cp_workflow,
input_data=span_event,
output_data=score,
)
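
Not part of the diff: a small sketch contrasting the simplified score used here (relevant chunks divided by total chunks) with the precision-at-k formulation that the docstring says the original ragas metric uses. The hard-coded verdict list is a hypothetical example of the per-chunk 0/1 LLM verdicts aggregated above.

from typing import List


def simplified_context_precision(verdicts: List[int]) -> float:
    # This PR's score: number of relevant chunks divided by total chunks.
    return sum(1 if v else 0 for v in verdicts) / len(verdicts)


def mean_precision_at_k(verdicts: List[int]) -> float:
    # Precision-at-k formulation: average precision@k over the relevant chunks.
    total = 0.0
    hits = 0
    for k, verdict in enumerate(verdicts, start=1):
        if verdict:
            hits += 1
            total += hits / k
    return total / max(hits, 1)


verdicts = [1, 0, 1, 1]
print(simplified_context_precision(verdicts))  # 0.75
print(mean_precision_at_k(verdicts))           # (1/1 + 2/3 + 3/4) / 3 ≈ 0.81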
12 changes: 12 additions & 0 deletions ddtrace/llmobs/_evaluators/ragas/models.py
@@ -11,6 +11,18 @@
"""


class AnswerRelevanceClassification(BaseModel):
question: str
noncommittal: int


class ContextPrecisionVerification(BaseModel):
"""Answer for the verification task whether the context was useful."""

reason: str = Field(..., description="Reason for verification")
verdict: int = Field(..., description="Binary (0/1) verdict of verification")


class StatementFaithfulnessAnswer(BaseModel):
statement: str = Field(..., description="the original statement, word-by-word")
reason: str = Field(..., description="the reason of the verdict")
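
Not part of the diff: an illustrative example of how a raw LLM JSON response could be validated against the AnswerRelevanceClassification model added above, using the pydantic v1-style API that the rest of this PR relies on (parse_obj / parse_raw).

from pydantic import BaseModel


class AnswerRelevanceClassification(BaseModel):
    question: str
    noncommittal: int


raw = '{"question": "What is the capital of France?", "noncommittal": 0}'
parsed = AnswerRelevanceClassification.parse_raw(raw)  # pydantic v1-style parsing
print(parsed.question, parsed.noncommittal)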
6 changes: 5 additions & 1 deletion ddtrace/llmobs/_evaluators/runner.py
@@ -8,6 +8,8 @@
from ddtrace.internal.periodic import PeriodicService
from ddtrace.internal.telemetry import telemetry_writer
from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT
from ddtrace.llmobs._evaluators.ragas.answer_relevancy import RagasAnswerRelevancyEvaluator
from ddtrace.llmobs._evaluators.ragas.context_precision import RagasContextPrecisionEvaluator
from ddtrace.llmobs._evaluators.ragas.faithfulness import RagasFaithfulnessEvaluator
from ddtrace.llmobs._evaluators.sampler import EvaluatorRunnerSampler

@@ -16,7 +18,9 @@


SUPPORTED_EVALUATORS = {
RagasAnswerRelevancyEvaluator.LABEL: RagasAnswerRelevancyEvaluator,
RagasFaithfulnessEvaluator.LABEL: RagasFaithfulnessEvaluator,
RagasContextPrecisionEvaluator.LABEL: RagasContextPrecisionEvaluator,
}


@@ -50,7 +54,7 @@ def __init__(self, interval: float, llmobs_service=None, evaluators=None):
if evaluator in SUPPORTED_EVALUATORS:
evaluator_init_state = "ok"
try:
self.evaluators.append(SUPPORTED_EVALUATORS[evaluator](llmobs_service=llmobs_service))
self.evaluators.append(SUPPORTED_EVALUATORS[evaluator](llmobs_service=llmobs_service)) # noqa: E501
Contributor:
Why is this fmt comment required?

Contributor Author:
There was an error being raised by mypy due to the use of ABC. I think it was a bug, since it only appeared when we had 3 or more evaluators (see: python/mypy#13044). But we are not using an abstract class for BaseRagasEvaluator anyway, so this is no longer needed.

except NotImplementedError as e:
evaluator_init_state = "error"
raise e
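
Not part of the diff: a stand-alone illustration of the label-to-class registry pattern that EvaluatorRunner uses above to instantiate the newly registered evaluators. FakeEvaluator and build_evaluators are hypothetical names used only for this sketch.

class FakeEvaluator:
    LABEL = "ragas_answer_relevancy"

    def __init__(self, llmobs_service):
        self.llmobs_service = llmobs_service


SUPPORTED_EVALUATORS = {FakeEvaluator.LABEL: FakeEvaluator}


def build_evaluators(requested_labels, llmobs_service):
    # Mirrors EvaluatorRunner.__init__: only labels present in the registry are
    # instantiated; the real runner also records an init state for telemetry.
    return [
        SUPPORTED_EVALUATORS[label](llmobs_service=llmobs_service)
        for label in requested_labels
        if label in SUPPORTED_EVALUATORS
    ]


evaluators = build_evaluators(["ragas_answer_relevancy", "unknown_label"], llmobs_service=None)
print([type(e).__name__ for e in evaluators])  # ['FakeEvaluator']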