Commit

Merge branch 'main' into yifanmai/cross-provider-vertex-ai
elronbandel authored Jan 20, 2025
2 parents 660bb71 + 6dcf08e commit 75bebcb
Showing 262 changed files with 12,503 additions and 724 deletions.
61 changes: 21 additions & 40 deletions examples/evaluate_existing_dataset_by_llm_as_judge_direct.py
@@ -5,7 +5,6 @@
 from unitxt.inference import (
     CrossProviderInferenceEngine,
 )
-from unitxt.text_utils import print_dict

 logger = get_logger()
 settings = get_settings()
@@ -16,15 +15,14 @@
 metrics = [
     "metrics.llm_as_judge.direct.rits.llama3_1_70b"
     "[context_fields=[context,question],"
-    f"criteria=metrics.llm_as_judge.direct.criterias.{criteria},"
-    f"score_prefix={criteria}_]"
+    f"criteria=metrics.llm_as_judge.direct.criterias.{criteria}]"
     for criteria in criterias
 ]
 dataset = load_dataset(
     card="cards.squad",
     metrics=metrics,
-    loader_limit=10,
-    max_test_instances=10,
+    loader_limit=20,
+    max_test_instances=20,
     split="test",
 )

@@ -48,37 +46,20 @@
 evaluated_predictions = evaluate(predictions=predictions, data=dataset)
 evaluated_gold_answers = evaluate(predictions=gold_answers, data=dataset)

-print_dict(
-    evaluated_predictions[0],
-    keys_to_print=[
-        "source",
-        "score",
-    ],
-)
-print_dict(
-    evaluated_gold_answers[0],
-    keys_to_print=[
-        "source",
-        "score",
-    ],
-)
-
 for criteria in criterias:
     logger.info(f"Scores for criteria '{criteria}'")
     gold_answer_scores = [
-        instance["score"]["instance"][f"{criteria}_llm_as_a_judge_score"]
-        for instance in evaluated_gold_answers
+        instance["score"]["instance"][criteria] for instance in evaluated_gold_answers
     ]
     gold_answer_position_bias = [
-        int(instance["score"]["instance"][f"{criteria}_positional_bias"])
+        instance["score"]["instance"][f"{criteria}_positional_bias"]
         for instance in evaluated_gold_answers
     ]
     prediction_scores = [
-        instance["score"]["instance"][f"{criteria}_llm_as_a_judge_score"]
-        for instance in evaluated_predictions
+        instance["score"]["instance"][criteria] for instance in evaluated_predictions
     ]
-    prediction_position_bias = [
-        int(instance["score"]["instance"][f"{criteria}_positional_bias"])
+    prediction_scores_position_bias = [
+        instance["score"]["instance"][f"{criteria}_positional_bias"]
         for instance in evaluated_predictions
     ]

@@ -92,27 +73,27 @@
         f"Positional bias occurrence on gold answers: {statistics.mean(gold_answer_position_bias)}"
     )
     logger.info(
-        f"Positional bias occurrence on predicted answers: {statistics.mean(prediction_position_bias)}\n"
+        f"Positional bias occurrence on predicted answers: {statistics.mean(prediction_scores_position_bias)}\n"
     )

 """
-Output with 100 examples
+Output with 20 examples
 Scores for criteria 'answer_relevance'
-Scores of gold answers: 0.9625 +/- 0.14811526360619054
-Scores of predicted answers: 0.5125 +/- 0.4638102516061385
-Positional bias occurrence on gold answers: 0.03
-Positional bias occurrence on predicted answers: 0.12
+Scores of gold answers: 0.8875 +/- 0.18978866362906205
+Scores of predicted answers: 0.7625 +/- 0.3390679950439998
+Positional bias occurrence on gold answers: 0.25
+Positional bias occurrence on predicted answers: 0.25
 Scores for criteria 'coherence'
-Scores of gold answers: 0.159 +/- 0.15689216524464028
-Scores of predicted answers: 0.066 +/- 0.11121005695384194
-Positional bias occurrence on gold answers: 0.16
-Positional bias occurrence on predicted answers: 0.07
+Scores of gold answers: 0.8125 +/- 0.2910394257972982
+Scores of predicted answers: 0.6875 +/- 0.39632356531129037
+Positional bias occurrence on gold answers: 0.3
+Positional bias occurrence on predicted answers: 0.3
 Scores for criteria 'conciseness'
 Scores of gold answers: 1.0 +/- 0.0
-Scores of predicted answers: 0.34 +/- 0.47609522856952335
-Positional bias occurrence on gold answers: 0.03
-Positional bias occurrence on predicted answers: 0.01
+Scores of predicted answers: 0.6 +/- 0.5026246899500346
+Positional bias occurrence on gold answers: 0
+Positional bias occurrence on predicted answers: 0.05
 """
89 changes: 89 additions & 0 deletions examples/evaluate_existing_dataset_by_llm_as_judge_pairwise.py
@@ -0,0 +1,89 @@
import json

from unitxt import get_logger, get_settings, load_dataset
from unitxt.api import evaluate
from unitxt.inference import (
    CrossProviderInferenceEngine,
)
from unitxt.templates import NullTemplate

logger = get_logger()
settings = get_settings()

num_test_instances = 10

# Use the HF load_dataset API, to load the squad QA dataset using the standard template in the catalog.
# We limit loading to num_test_instances examples to reduce download time.

dataset = load_dataset(
card="cards.squad",
loader_limit=num_test_instances,
max_test_instances=num_test_instances,
split="test",
)

# Infer a model to get predictions.
inference_model_1 = CrossProviderInferenceEngine(
    model="llama-3-2-1b-instruct", provider="watsonx"
)

inference_model_2 = CrossProviderInferenceEngine(
    model="llama-3-8b-instruct", provider="watsonx"
)

inference_model_3 = CrossProviderInferenceEngine(
    model="llama-3-70b-instruct", provider="watsonx"
)

"""
We are using a CrossProviderInferenceEngine inference engine that supplies API access to providers such as:
watsonx, bam, openai, azure, aws and more.
For the arguments these inference engines can receive, please refer to the classes' documentation or read
about the OpenAI API arguments that CrossProviderInferenceEngine follows.
"""
predictions_1 = inference_model_1.infer(dataset)
predictions_2 = inference_model_2.infer(dataset)
predictions_3 = inference_model_3.infer(dataset)

gold_answers = [d[0] for d in dataset["references"]]

# Evaluate the predictions using the defined metric.
predictions = [
    list(t)
    for t in list(zip(gold_answers, predictions_1, predictions_2, predictions_3))
]

print(json.dumps(predictions, indent=4))

criterias = ["factually_consistent"]
metrics = [
    "metrics.llm_as_judge.pairwise.rits.llama3_1_405b"
    f"[criteria=metrics.llm_as_judge.pairwise.criterias.{criteria},"
    "context_fields=[context,question]]"
    for criteria in criterias
]
dataset = load_dataset(
    card="cards.squad",
    loader_limit=num_test_instances,
    max_test_instances=num_test_instances,
    metrics=metrics,
    template=NullTemplate(),
    split="test",
)

evaluated_predictions = evaluate(predictions=predictions, data=dataset)

prediction_scores_by_system = {
    f"system_{system}": {
        "per_instance_winrate": [
            instance["score"]["instance"][f"{system}_winrate"]
            for instance in evaluated_predictions
        ],
        "mean_winrate": evaluated_predictions[0]["score"]["global"][
            f"{system}_winrate"
        ],
    }
    for system in range(1, len(predictions[0]) + 1)
}
print(json.dumps(prediction_scores_by_system, indent=4))
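
A small follow-on sketch, reusing the prediction_scores_by_system dict built above, that orders the compared systems by mean winrate (the ranked variable is illustrative only):

# Sort systems by mean winrate, best first.
ranked = sorted(
    prediction_scores_by_system.items(),
    key=lambda kv: kv[1]["mean_winrate"],
    reverse=True,
)
for name, scores in ranked:
    print(f"{name}: mean winrate = {scores['mean_winrate']}")
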
22 changes: 8 additions & 14 deletions examples/evaluate_external_rag_results_with_binary_llm_as_judge.py
@@ -53,22 +53,17 @@

 # Select the desired metric(s).
 # Each metric measures a certain aspect of the generated answer (answer_correctness, faithfulness,
-# answer_relevance, context_relevance and correctness_holistic).
-# All available metrics are under "catalog.metrics.rag"
-# Those with extension "logprobs" provide a real value prediction in [0,1], the others provide a binary prediction.
-# By default, all judges use llama_3_1_70b_instruct_wml. We will soon see how to change this.
+# answer_relevance and context_relevance).
+# All available metrics are under "catalog.metrics.rag.autorag.", ending with "judge"
+# By default, all judges use llama_3_3_70b_instruct. We will soon see how to change this.
 metric_names = [
-    "metrics.rag.answer_correctness.llama_3_1_70b_instruct_wml_q_a_gt_loose_logprobs",
-    "metrics.rag.faithfulness.llama_3_1_70b_instruct_wml_q_c_a_logprobs",
+    "metrics.rag.autorag.answer_correctness.llama_3_3_70b_instruct_wml_judge",
+    "metrics.rag.autorag.faithfulness.llama_3_3_70b_instruct_wml_judge",
 ]

 # select the desired model.
 # all available models are under "catalog.engines.classification"
-model_names = [
-    "engines.classification.mixtral_8x7b_instruct_v01_wml",
-    "engines.classification.llama_3_1_70b_instruct_wml",
-    # "engines.classification.gpt_4_turbo_openai",
-]
+model_names = ["engines.classification.mixtral_8x7b_instruct_v01_wml"]

 if __name__ == "__main__":
     multi_stream = MultiStream.from_iterables({"test": test_examples}, copying=True)
@@ -79,9 +74,8 @@

     for metric_name in metric_names:
         for model_name in model_names:
-            # override the metric with the inference model. the default model is llama_3_1_70b_instruct_wml so
-            # no need to override when using it.
-            llmaj_metric_name = f"{metric_name}[model={model_name}]"
+            # override the metric with the inference model (to use a model different from the one in the metric name)
+            llmaj_metric_name = f"{metric_name}[inference_model={model_name}]"

             # apply the metric over the input
             metrics_operator = SequentialOperator(steps=[llmaj_metric_name])
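
For clarity, a sketch of the metric-override string the loop above composes, using the first entries of metric_names and model_names (the composed value is shown for illustration only):

metric_name = "metrics.rag.autorag.answer_correctness.llama_3_3_70b_instruct_wml_judge"
model_name = "engines.classification.mixtral_8x7b_instruct_v01_wml"
llmaj_metric_name = f"{metric_name}[inference_model={model_name}]"
# -> "metrics.rag.autorag.answer_correctness.llama_3_3_70b_instruct_wml_judge[inference_model=engines.classification.mixtral_8x7b_instruct_v01_wml]"
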
Changes in another file:
@@ -2,7 +2,7 @@

 from unitxt import evaluate, load_dataset
 from unitxt.blocks import Task, TaskCard
-from unitxt.llm_as_judge_operators import CreateYesNoCriteriaFromString
+from unitxt.llm_as_judge import CreateYesNoCriteriaFromString
 from unitxt.loaders import LoadFromDictionary

 data = {

Changes in another file:
@@ -30,4 +30,4 @@
 print(results.global_scores.summary)

 print("Instance Scores:")
-print(results.instance_scores.summary)
+print(results.instance_scores)

Changes in another file:
@@ -59,4 +59,4 @@
 print(results.global_scores.summary)

 print("Instance Scores:")
-print(results.instance_scores.summary)
+print(results.instance_scores)

Changes in another file:
@@ -2,9 +2,7 @@

 from unitxt import evaluate, load_dataset
 from unitxt.blocks import Task, TaskCard
-from unitxt.llm_as_judge_operators import (
-    CreateCriteriaFromString,
-)
+from unitxt.llm_as_judge import CreateCriteriaFromString
 from unitxt.loaders import LoadFromDictionary
 from unitxt.templates import NullTemplate

Changes in another file:
@@ -2,7 +2,7 @@

 from unitxt import evaluate, load_dataset
 from unitxt.blocks import Task, TaskCard
-from unitxt.llm_as_judge_operators import LoadCriteria
+from unitxt.llm_as_judge import LoadCriteria
 from unitxt.loaders import LoadFromDictionary
 from unitxt.templates import NullTemplate

Changes in another file:
@@ -4,8 +4,7 @@
 from unitxt.api import evaluate, load_dataset
 from unitxt.card import Task, TaskCard
 from unitxt.inference import CrossProviderInferenceEngine
-from unitxt.llm_as_judge import LLMJudgePairwise
-from unitxt.llm_as_judge_operators import CreateCriteriaFromDict
+from unitxt.llm_as_judge import CreateCriteriaFromDict, LLMJudgePairwise
 from unitxt.loaders import LoadFromDictionary
 from unitxt.templates import NullTemplate

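Taken together, the import changes above consolidate the criteria operators under unitxt.llm_as_judge. A minimal sketch of the resulting import style, using only names that appear in the added lines:

from unitxt.llm_as_judge import (
    CreateCriteriaFromDict,
    CreateCriteriaFromString,
    CreateYesNoCriteriaFromString,
    LLMJudgePairwise,
    LoadCriteria,
)
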
10 changes: 10 additions & 0 deletions examples/evaluate_rag_end_to_end_dataset_with_given_predictions.py
@@ -45,11 +45,21 @@
     },
 ]

+# select recommended metrics according to your available resources.
+metrics = [
+    "metrics.rag.end_to_end.recommended.cpu_only.all",
+    # "metrics.rag.end_to_end.recommended.small_llm.all",
+    # "metrics.rag.end_to_end.recommended.llmaj_watsonx.all",
+    # "metrics.rag.end_to_end.recommended.llmaj_rits.all"
+    # "metrics.rag.end_to_end.recommended.llmaj_azure.all"
+]
+
 dataset = create_dataset(
     task="tasks.rag.end_to_end",
     test_set=dataset,
     split="test",
     postprocessors=[],
+    metrics=metrics,
 )

 results = evaluate(predictions, dataset)
10 changes: 10 additions & 0 deletions examples/evaluate_rag_response_generation.py
@@ -58,13 +58,23 @@
     ),
 )

+# select recommended metrics according to your available resources.
+metrics = [
+    "metrics.rag.response_generation.recommended.cpu_only.all",
+    # "metrics.rag.response_generation.recommended.small_llm.all",
+    # "metrics.rag.response_generation.recommended.llmaj_watsonx.all",
+    # "metrics.rag.response_generation.recommended.llmaj_rits.all"
+    # "metrics.rag.response_generation.recommended.llmaj_azure.all"
+]
+
 # Verbalize the dataset using the template
 dataset = load_dataset(
     card=card,
     template_card_index="simple",
     format="formats.chat_api",
     split="test",
     max_test_instances=10,
+    metrics=metrics,
 )

