chore(llmobs): implement answer relevancy ragas metric #11738
@@ -0,0 +1,151 @@
import math
from typing import Optional
from typing import Tuple
from typing import Union

from ddtrace.internal.logger import get_logger
from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA
from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator
from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace


logger = get_logger(__name__)

class RagasAnswerRelevancyEvaluator(BaseRagasEvaluator):
    """A class used by EvaluatorRunner to conduct ragas answer relevancy evaluations
    on LLM Observability span events. The job of an Evaluator is to take a span and
    submit evaluation metrics based on the span's attributes.
    """

    LABEL = "ragas_answer_relevancy"
    METRIC_TYPE = "score"

    def __init__(self, llmobs_service):
        """
        Initialize an evaluator that uses the ragas library to generate an answer relevancy score on finished LLM spans.

        Answer relevancy assesses how pertinent the generated answer is to a given question.
        A lower score is assigned to answers that are incomplete or contain redundant information, and higher scores
        indicate better relevancy. This metric is computed using the question, contexts, and answer.

        For more information, see https://docs.ragas.io/en/latest/concepts/metrics/available_metrics/answer_relevance/

        The `ragas.metrics.answer_relevancy` instance is used for answer relevancy scores.
        If there is no llm attribute set on this instance, it will be set to the
        default `llm_factory()` from ragas, which uses openai.
        If there is no embeddings attribute set on this instance, it will be set to the
        default `embedding_factory()` from ragas, which uses openai.

        :param llmobs_service: An instance of the LLM Observability service used for tracing the evaluation and
                               submitting evaluation metrics.

        Raises: NotImplementedError if the ragas library is not found or if the ragas version is not supported.
        """
        super().__init__(llmobs_service)
        self.ragas_answer_relevancy_instance = self._get_answer_relevancy_instance()
        self.answer_relevancy_output_parser = self.mini_ragas.RagasoutputParser(
            pydantic_object=self.mini_ragas.AnswerRelevanceClassification
        )

    def _get_answer_relevancy_instance(self):
        """
        This helper function ensures the answer relevancy instance used in the
        ragas evaluator is updated with the latest ragas answer relevancy instance
        AND has a non-null llm.
        """
        if self.mini_ragas.answer_relevancy is None:
            return None
        ragas_answer_relevancy_instance = self.mini_ragas.answer_relevancy
        if not ragas_answer_relevancy_instance.llm:
            ragas_answer_relevancy_instance.llm = self.mini_ragas.llm_factory()
        if not ragas_answer_relevancy_instance.embeddings:
            ragas_answer_relevancy_instance.embeddings = self.mini_ragas.embedding_factory()
        return ragas_answer_relevancy_instance

    def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]:
        """
        Performs an answer relevancy evaluation on an LLM span event, returning either
            - answer relevancy score (float) OR failure reason (str)
            - evaluation metadata (dict)
        If the ragas answer relevancy instance does not have `llm` set, we set `llm` using the `llm_factory()`
        method from ragas, which currently defaults to openai's gpt-4o-turbo.
        """
        self.ragas_answer_relevancy_instance = self._get_answer_relevancy_instance()
        if not self.ragas_answer_relevancy_instance:
            return "fail_answer_relevancy_is_none", {}

        evaluation_metadata = {}  # type: dict[str, Union[str, dict, list]]
        trace_metadata = {}  # type: dict[str, Union[str, dict, list]]

        # initialize data we annotate for tracing ragas
        score, question, answer, answer_classifications = (
            math.nan,
            None,
            None,
            None,
        )

        with self.llmobs_service.workflow(
            "dd-ragas.answer_relevancy", ml_app=_get_ml_app_for_ragas_trace(span_event)
        ) as ragas_ar_workflow:
            try:
                evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span(span=ragas_ar_workflow)

                cp_inputs = self._extract_evaluation_inputs_from_span(span_event)
                if cp_inputs is None:
                    logger.debug(
                        "Failed to extract question and contexts from "
                        "span sampled for `ragas_answer_relevancy` evaluation"
                    )
                    return "fail_extract_answer_relevancy_inputs", evaluation_metadata

                question = cp_inputs["question"]
                contexts = cp_inputs["contexts"]
                answer = cp_inputs["answer"]

Review comment (on lines +103 to +105): Same comment as before, doesn't seem necessary to explicitly separate into new variables since they're only called once immediately at most.

                prompt = self.ragas_answer_relevancy_instance.question_generation.format(
                    answer=answer,
                    context="\n".join(contexts),
                )

                # 'strictness' is a parameter that can be set to control the number of generations
                trace_metadata["strictness"] = self.ragas_answer_relevancy_instance.strictness
                result = self.ragas_answer_relevancy_instance.llm.generate_text(
                    prompt, n=self.ragas_answer_relevancy_instance.strictness
                )

                try:
                    answers = [self.answer_relevancy_output_parser.parse(res.text) for res in result.generations[0]]
                    answers = [answer for answer in answers if answer is not None]
                except Exception as e:
                    logger.debug("Failed to parse answer relevancy output: %s", e)
                    return "fail_parse_answer_relevancy_output", evaluation_metadata

                gen_questions = [answer.question for answer in answers]
                answer_classifications = [
                    {"question": answer.question, "noncommittal": answer.noncommittal} for answer in answers
                ]
                trace_metadata["answer_classifications"] = answer_classifications
                if all(q == "" for q in gen_questions):
                    logger.warning("Invalid JSON response. Expected dictionary with key 'question'")
                    return "fail_parse_answer_relevancy_output", evaluation_metadata

                # calculate cosine similarity between the question and generated questions
                with self.llmobs_service.workflow("dd-ragas.calculate_similarity") as ragas_cs_workflow:
                    cosine_sim = self.ragas_answer_relevancy_instance.calculate_similarity(question, gen_questions)
                    self.llmobs_service.annotate(
                        span=ragas_cs_workflow,
                        input_data={"question": question, "generated_questions": gen_questions},
                        output_data=cosine_sim.mean(),
                    )

                score = cosine_sim.mean() * int(not any(answer.noncommittal for answer in answers))
                return score, evaluation_metadata
            finally:
                self.llmobs_service.annotate(
                    span=ragas_ar_workflow,
                    input_data=span_event,
                    output_data=score,
                    metadata=trace_metadata,
                )
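For intuition, the score computed above comes down to a small amount of math once the generated questions are available: take the mean cosine similarity between the original question and the questions regenerated from the answer, then zero it out if any generation was classified as noncommittal. Below is a minimal, self-contained sketch of that final step; the toy vectors stand in for embeddings produced by the ragas embeddings model and are not part of this PR's code.

import numpy as np


def answer_relevancy_score(question_vec, gen_question_vecs, noncommittal_flags):
    """Mean cosine similarity between the original question and the questions
    regenerated from the answer, forced to 0 if any generation is noncommittal."""
    q = np.asarray(question_vec, dtype=float)
    gens = np.asarray(gen_question_vecs, dtype=float)
    # cosine similarity of each generated question against the original question
    sims = gens @ q / (np.linalg.norm(gens, axis=1) * np.linalg.norm(q) + 1e-10)
    return float(sims.mean()) * int(not any(noncommittal_flags))


# toy usage with made-up 3-d "embeddings": two regenerated questions, neither noncommittal
print(answer_relevancy_score(
    question_vec=[1.0, 0.0, 0.0],
    gen_question_vecs=[[0.9, 0.1, 0.0], [0.8, 0.0, 0.2]],
    noncommittal_flags=[False, False],
))  # close to 1.0, since the regenerated questions point the same way as the original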
@@ -0,0 +1,159 @@
import math
from typing import Optional
from typing import Tuple
from typing import Union

from ddtrace.internal.logger import get_logger
from ddtrace.llmobs._constants import EVALUATION_KIND_METADATA
from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA
from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator
from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace


logger = get_logger(__name__)

class RagasContextPrecisionEvaluator(BaseRagasEvaluator):
    """A class used by EvaluatorRunner to conduct ragas context precision evaluations
    on LLM Observability span events. The job of an Evaluator is to take a span and
    submit evaluation metrics based on the span's attributes.
    """

    LABEL = "ragas_context_precision"
    METRIC_TYPE = "score"

    def __init__(self, llmobs_service):
        """
        Initialize an evaluator that uses the ragas library to generate a context precision score on finished LLM spans.

        Context Precision is a metric that verifies whether the context was useful in arriving at the given answer.
        We compute this by dividing the number of relevant contexts by the total number of contexts.
        Note that this is slightly modified from the original context precision metric in ragas, which computes
        the mean of the precision @ rank k for each chunk in the context (where k is the number of
        retrieved context chunks).

        For more information, see https://docs.ragas.io/en/latest/concepts/metrics/available_metrics/context_precision/

        The `ragas.metrics.context_precision` instance is used for context precision scores.
        If there is no llm attribute set on this instance, it will be set to the
        default `llm_factory()`, which uses openai.

        :param llmobs_service: An instance of the LLM Observability service used for tracing the evaluation and
                               submitting evaluation metrics.

        Raises: NotImplementedError if the ragas library is not found or if the ragas version is not supported.
        """
        super().__init__(llmobs_service)
        self.ragas_context_precision_instance = self._get_context_precision_instance()
        self.context_precision_output_parser = self.mini_ragas.RagasoutputParser(
            pydantic_object=self.mini_ragas.ContextPrecisionVerification
        )

    def _get_context_precision_instance(self):
        """
        This helper function ensures the context precision instance used in the
        ragas evaluator is updated with the latest ragas context precision instance
        AND has a non-null llm.
        """
        if self.mini_ragas.context_precision is None:
            return None
        ragas_context_precision_instance = self.mini_ragas.context_precision
        if not ragas_context_precision_instance.llm:
            ragas_context_precision_instance.llm = self.mini_ragas.llm_factory()
        return ragas_context_precision_instance

    def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]:
        """
        Performs a context precision evaluation on an LLM span event, returning either
            - context precision score (float) OR failure reason (str)
            - evaluation metadata (dict)
        If the ragas context precision instance does not have `llm` set, we set `llm` using the `llm_factory()`
        method from ragas, which currently defaults to openai's gpt-4o-turbo.
        """
        self.ragas_context_precision_instance = self._get_context_precision_instance()
        if not self.ragas_context_precision_instance:
            return "fail_context_precision_is_none", {}

        evaluation_metadata = {EVALUATION_KIND_METADATA: "context_precision"}  # type: dict[str, Union[str, dict, list]]

        # initialize data we annotate for tracing ragas
        score, question, answer = (
            math.nan,
            None,
            None,
        )

        with self.llmobs_service.workflow(
            "dd-ragas.context_precision", ml_app=_get_ml_app_for_ragas_trace(span_event)
        ) as ragas_cp_workflow:
            try:
                evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span(span=ragas_cp_workflow)

                cp_inputs = self._extract_evaluation_inputs_from_span(span_event)
                if cp_inputs is None:
                    logger.debug(
                        "Failed to extract evaluation inputs from "
                        "span sampled for `ragas_context_precision` evaluation"
                    )
                    return "fail_extract_context_precision_inputs", evaluation_metadata

                question = cp_inputs["question"]
                contexts = cp_inputs["contexts"]
                answer = cp_inputs["answer"]

                # create a prompt to evaluate the relevancy of each context chunk
                ctx_precision_prompts = [
                    self.ragas_context_precision_instance.context_precision_prompt.format(
                        question=question, context=c, answer=answer
                    )
                    for c in contexts
                ]

                responses = []

                for prompt in ctx_precision_prompts:
                    result = self.ragas_context_precision_instance.llm.generate_text(prompt)
                    reproducibility = getattr(self.ragas_context_precision_instance, "_reproducibility", 1)

                    results = [result.generations[0][i].text for i in range(reproducibility)]
                    try:
                        responses.append(
                            [
                                res.dict()
                                for res in [self.context_precision_output_parser.parse(text) for text in results]
                                if res is not None
                            ]
                        )
                    except Exception as e:
                        logger.debug(
                            "Failed to parse context precision verification for `ragas_context_precision`",
                            exc_info=e,
                        )
                        return "fail_context_precision_parsing", evaluation_metadata

                answers = []
                for response in responses:
                    agg_answer = self.mini_ragas.ensembler.from_discrete([response], "verdict")
                    if agg_answer:
                        try:
                            agg_answer = self.mini_ragas.ContextPrecisionVerification.parse_obj(agg_answer[0])
                        except Exception as e:
                            logger.debug(
                                "Failed to parse context precision verification for `ragas_context_precision`",
                                exc_info=e,
                            )
                            return "fail_context_precision_parsing", evaluation_metadata
                    answers.append(agg_answer)

                if len(answers) == 0:
                    return "fail_no_answers", evaluation_metadata

                verdict_list = [1 if ver.verdict else 0 for ver in answers]
                score = sum(verdict_list) / len(verdict_list)
                return score, evaluation_metadata
            finally:
                self.llmobs_service.annotate(
                    span=ragas_cp_workflow,
                    input_data=span_event,
                    output_data=score,
                )
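As the docstring notes, the final score here is deliberately simpler than upstream ragas: once the LLM has judged each retrieved context chunk, the score is just the fraction of chunks with a positive verdict. A small sketch of that aggregation step follows; the Verification class is a stand-in for the parsed ContextPrecisionVerification objects, not the real ragas type.

from dataclasses import dataclass
from typing import List


@dataclass
class Verification:
    verdict: int  # 1 if the chunk was judged useful for the answer, else 0
    reason: str = ""


def context_precision_score(answers: List[Verification]) -> float:
    """Fraction of retrieved context chunks judged relevant to the answer."""
    if not answers:
        raise ValueError("no parsed verdicts to score")
    verdict_list = [1 if ver.verdict else 0 for ver in answers]
    return sum(verdict_list) / len(verdict_list)


# toy usage: 2 of 3 retrieved chunks judged useful
print(context_precision_score([Verification(1), Verification(0), Verification(1)]))  # 0.666...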
@@ -8,6 +8,8 @@
from ddtrace.internal.periodic import PeriodicService
from ddtrace.internal.telemetry import telemetry_writer
from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT
from ddtrace.llmobs._evaluators.ragas.answer_relevancy import RagasAnswerRelevancyEvaluator
from ddtrace.llmobs._evaluators.ragas.context_precision import RagasContextPrecisionEvaluator
from ddtrace.llmobs._evaluators.ragas.faithfulness import RagasFaithfulnessEvaluator
from ddtrace.llmobs._evaluators.sampler import EvaluatorRunnerSampler

@@ -16,7 +18,9 @@

SUPPORTED_EVALUATORS = {
    RagasAnswerRelevancyEvaluator.LABEL: RagasAnswerRelevancyEvaluator,
    RagasFaithfulnessEvaluator.LABEL: RagasFaithfulnessEvaluator,
    RagasContextPrecisionEvaluator.LABEL: RagasContextPrecisionEvaluator,
}

@@ -50,7 +54,7 @@ def __init__(self, interval: float, llmobs_service=None, evaluators=None):
            if evaluator in SUPPORTED_EVALUATORS:
                evaluator_init_state = "ok"
                try:
-                    self.evaluators.append(SUPPORTED_EVALUATORS[evaluator](llmobs_service=llmobs_service))
+                    self.evaluators.append(SUPPORTED_EVALUATORS[evaluator](llmobs_service=llmobs_service))  # noqa: E501

Review comment: Why is this fmt comment required?
Reply: there was an error being raised by mypy due to the use of

                except NotImplementedError as e:
                    evaluator_init_state = "error"
                    raise e
Review comment: dumb question - what is the purpose of having a different LLM instance per eval metric runner? Are they not all references to the same base OpenAI() LLM?

Reply: Each eval metric runner has access to one instance of a ragas metric (answer_relevancy, context_precision, faithfulness), and each of these ragas metrics has a separate llm attribute. We maintain a reference to the ragas metric, not the llm attribute. But yeah, the default is openai for all of them.
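To illustrate the reply above, here is a rough sketch of what each evaluator effectively does with its ragas metric singleton. It assumes a ragas 0.1.x-style layout where ragas.metrics exposes these metric instances and ragas.llms / ragas.embeddings expose the default factories; in the PR this goes through the mini_ragas shim rather than direct imports.

from ragas.embeddings import embedding_factory  # default embeddings, backed by openai
from ragas.llms import llm_factory  # default LLM, backed by openai
from ragas.metrics import answer_relevancy, context_precision, faithfulness

# each metric object carries its own `llm` attribute; an evaluator only fills in
# the default when the user has not already configured one on that metric
for metric in (answer_relevancy, context_precision, faithfulness):
    if getattr(metric, "llm", None) is None:
        metric.llm = llm_factory()

# answer relevancy additionally needs an embeddings model for its cosine similarity step
if getattr(answer_relevancy, "embeddings", None) is None:
    answer_relevancy.embeddings = embedding_factory()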