Commit

Merge branch 'main' into yifanmai/cross-provider-vertex-ai
elronbandel authored Jan 20, 2025
2 parents 660bb71 + 6dcf08e commit 75bebcb
Showing 262 changed files with 12,503 additions and 724 deletions.
61 changes: 21 additions & 40 deletions examples/evaluate_existing_dataset_by_llm_as_judge_direct.py
@@ -5,7 +5,6 @@
 from unitxt.inference import (
     CrossProviderInferenceEngine,
 )
-from unitxt.text_utils import print_dict

 logger = get_logger()
 settings = get_settings()
@@ -16,15 +15,14 @@
 metrics = [
     "metrics.llm_as_judge.direct.rits.llama3_1_70b"
     "[context_fields=[context,question],"
-    f"criteria=metrics.llm_as_judge.direct.criterias.{criteria},"
-    f"score_prefix={criteria}_]"
+    f"criteria=metrics.llm_as_judge.direct.criterias.{criteria}]"
     for criteria in criterias
 ]
 dataset = load_dataset(
     card="cards.squad",
     metrics=metrics,
-    loader_limit=10,
-    max_test_instances=10,
+    loader_limit=20,
+    max_test_instances=20,
     split="test",
 )

@@ -48,37 +46,20 @@
 evaluated_predictions = evaluate(predictions=predictions, data=dataset)
 evaluated_gold_answers = evaluate(predictions=gold_answers, data=dataset)

-print_dict(
-    evaluated_predictions[0],
-    keys_to_print=[
-        "source",
-        "score",
-    ],
-)
-print_dict(
-    evaluated_gold_answers[0],
-    keys_to_print=[
-        "source",
-        "score",
-    ],
-)
-
 for criteria in criterias:
     logger.info(f"Scores for criteria '{criteria}'")
     gold_answer_scores = [
-        instance["score"]["instance"][f"{criteria}_llm_as_a_judge_score"]
-        for instance in evaluated_gold_answers
+        instance["score"]["instance"][criteria] for instance in evaluated_gold_answers
     ]
     gold_answer_position_bias = [
-        int(instance["score"]["instance"][f"{criteria}_positional_bias"])
+        instance["score"]["instance"][f"{criteria}_positional_bias"]
         for instance in evaluated_gold_answers
     ]
     prediction_scores = [
-        instance["score"]["instance"][f"{criteria}_llm_as_a_judge_score"]
-        for instance in evaluated_predictions
+        instance["score"]["instance"][criteria] for instance in evaluated_predictions
     ]
-    prediction_position_bias = [
-        int(instance["score"]["instance"][f"{criteria}_positional_bias"])
+    prediction_scores_position_bias = [
+        instance["score"]["instance"][f"{criteria}_positional_bias"]
         for instance in evaluated_predictions
     ]

@@ -92,27 +73,27 @@
         f"Positional bias occurrence on gold answers: {statistics.mean(gold_answer_position_bias)}"
     )
     logger.info(
-        f"Positional bias occurrence on predicted answers: {statistics.mean(prediction_position_bias)}\n"
+        f"Positional bias occurrence on predicted answers: {statistics.mean(prediction_scores_position_bias)}\n"
     )

 """
-Output with 100 examples
+Output with 20 examples
 Scores for criteria 'answer_relevance'
-Scores of gold answers: 0.9625 +/- 0.14811526360619054
-Scores of predicted answers: 0.5125 +/- 0.4638102516061385
-Positional bias occurrence on gold answers: 0.03
-Positional bias occurrence on predicted answers: 0.12
+Scores of gold answers: 0.8875 +/- 0.18978866362906205
+Scores of predicted answers: 0.7625 +/- 0.3390679950439998
+Positional bias occurrence on gold answers: 0.25
+Positional bias occurrence on predicted answers: 0.25
 Scores for criteria 'coherence'
-Scores of gold answers: 0.159 +/- 0.15689216524464028
-Scores of predicted answers: 0.066 +/- 0.11121005695384194
-Positional bias occurrence on gold answers: 0.16
-Positional bias occurrence on predicted answers: 0.07
+Scores of gold answers: 0.8125 +/- 0.2910394257972982
+Scores of predicted answers: 0.6875 +/- 0.39632356531129037
+Positional bias occurrence on gold answers: 0.3
+Positional bias occurrence on predicted answers: 0.3
 Scores for criteria 'conciseness'
 Scores of gold answers: 1.0 +/- 0.0
-Scores of predicted answers: 0.34 +/- 0.47609522856952335
-Positional bias occurrence on gold answers: 0.03
-Positional bias occurrence on predicted answers: 0.01
+Scores of predicted answers: 0.6 +/- 0.5026246899500346
+Positional bias occurrence on gold answers: 0
+Positional bias occurrence on predicted answers: 0.05
 """
89 changes: 89 additions & 0 deletions examples/evaluate_existing_dataset_by_llm_as_judge_pairwise.py
@@ -0,0 +1,89 @@
import json

from unitxt import get_logger, get_settings, load_dataset
from unitxt.api import evaluate
from unitxt.inference import (
    CrossProviderInferenceEngine,
)
from unitxt.templates import NullTemplate

logger = get_logger()
settings = get_settings()

num_test_instances = 10

# Use the HF load_dataset API, to load the squad QA dataset using the standard template in the catalog.
# We limit loading to num_test_instances examples to reduce download time.

dataset = load_dataset(
card="cards.squad",
loader_limit=num_test_instances,
max_test_instances=num_test_instances,
split="test",
)

# Infer a model to get predictions.
inference_model_1 = CrossProviderInferenceEngine(
    model="llama-3-2-1b-instruct", provider="watsonx"
)

inference_model_2 = CrossProviderInferenceEngine(
    model="llama-3-8b-instruct", provider="watsonx"
)

inference_model_3 = CrossProviderInferenceEngine(
    model="llama-3-70b-instruct", provider="watsonx"
)

"""
We are using a CrossProviderInferenceEngine inference engine that supplies API access to providers such as:
watsonx, bam, openai, azure, aws and more.
For the arguments these inference engines can receive, please refer to the classes' documentation or read
about the OpenAI API arguments that CrossProviderInferenceEngine follows.
"""
predictions_1 = inference_model_1.infer(dataset)
predictions_2 = inference_model_2.infer(dataset)
predictions_3 = inference_model_3.infer(dataset)

gold_answers = [d[0] for d in dataset["references"]]

# Evaluate the predictions using the defined metric.
predictions = [
    list(t)
    for t in list(zip(gold_answers, predictions_1, predictions_2, predictions_3))
]

print(json.dumps(predictions, indent=4))

criterias = ["factually_consistent"]
metrics = [
    "metrics.llm_as_judge.pairwise.rits.llama3_1_405b"
    f"[criteria=metrics.llm_as_judge.pairwise.criterias.{criteria},"
    "context_fields=[context,question]]"
    for criteria in criterias
]
dataset = load_dataset(
    card="cards.squad",
    loader_limit=num_test_instances,
    max_test_instances=num_test_instances,
    metrics=metrics,
    template=NullTemplate(),
    split="test",
)

evaluated_predictions = evaluate(predictions=predictions, data=dataset)

prediction_scores_by_system = {
    f"system_{system}": {
        "per_instance_winrate": [
            instance["score"]["instance"][f"{system}_winrate"]
            for instance in evaluated_predictions
        ],
        "mean_winrate": evaluated_predictions[0]["score"]["global"][
            f"{system}_winrate"
        ],
    }
    for system in range(1, len(predictions[0]) + 1)
}
print(json.dumps(prediction_scores_by_system, indent=4))
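
A small follow-on sketch, reusing the prediction_scores_by_system dict built above, that orders the compared systems by mean winrate (the ranked variable is illustrative only):

# Sort systems by mean winrate, best first.
ranked = sorted(
    prediction_scores_by_system.items(),
    key=lambda kv: kv[1]["mean_winrate"],
    reverse=True,
)
for name, scores in ranked:
    print(f"{name}: mean winrate = {scores['mean_winrate']}")
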
22 changes: 8 additions & 14 deletions examples/evaluate_external_rag_results_with_binary_llm_as_judge.py
@@ -53,22 +53,17 @@

 # Select the desired metric(s).
 # Each metric measures a certain aspect of the generated answer (answer_correctness, faithfulness,
-# answer_relevance, context_relevance and correctness_holistic).
-# All available metrics are under "catalog.metrics.rag"
-# Those with extension "logprobs" provide a real value prediction in [0,1], the others provide a binary prediction.
-# By default, all judges use llama_3_1_70b_instruct_wml. We will soon see how to change this.
+# answer_relevance and context_relevance).
+# All available metrics are under "catalog.metrics.rag.autorag.", ending with "judge"
+# By default, all judges use llama_3_3_70b_instruct. We will soon see how to change this.
 metric_names = [
-    "metrics.rag.answer_correctness.llama_3_1_70b_instruct_wml_q_a_gt_loose_logprobs",
-    "metrics.rag.faithfulness.llama_3_1_70b_instruct_wml_q_c_a_logprobs",
+    "metrics.rag.autorag.answer_correctness.llama_3_3_70b_instruct_wml_judge",
+    "metrics.rag.autorag.faithfulness.llama_3_3_70b_instruct_wml_judge",
 ]

 # select the desired model.
 # all available models are under "catalog.engines.classification"
-model_names = [
-    "engines.classification.mixtral_8x7b_instruct_v01_wml",
-    "engines.classification.llama_3_1_70b_instruct_wml",
-    # "engines.classification.gpt_4_turbo_openai",
-]
+model_names = ["engines.classification.mixtral_8x7b_instruct_v01_wml"]

 if __name__ == "__main__":
     multi_stream = MultiStream.from_iterables({"test": test_examples}, copying=True)
@@ -79,9 +74,8 @@

     for metric_name in metric_names:
         for model_name in model_names:
-            # override the metric with the inference model. the default model is llama_3_1_70b_instruct_wml so
-            # no need to override when using it.
-            llmaj_metric_name = f"{metric_name}[model={model_name}]"
+            # override the metric with the inference model (to use a model different from the one in the metric name)
+            llmaj_metric_name = f"{metric_name}[inference_model={model_name}]"

             # apply the metric over the input
             metrics_operator = SequentialOperator(steps=[llmaj_metric_name])
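
For clarity, a sketch of the metric-override string the loop above composes, using the first entries of metric_names and model_names (the composed value is shown for illustration only):

metric_name = "metrics.rag.autorag.answer_correctness.llama_3_3_70b_instruct_wml_judge"
model_name = "engines.classification.mixtral_8x7b_instruct_v01_wml"
llmaj_metric_name = f"{metric_name}[inference_model={model_name}]"
# -> "metrics.rag.autorag.answer_correctness.llama_3_3_70b_instruct_wml_judge[inference_model=engines.classification.mixtral_8x7b_instruct_v01_wml]"
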
Changes in another file:
@@ -2,7 +2,7 @@

 from unitxt import evaluate, load_dataset
 from unitxt.blocks import Task, TaskCard
-from unitxt.llm_as_judge_operators import CreateYesNoCriteriaFromString
+from unitxt.llm_as_judge import CreateYesNoCriteriaFromString
 from unitxt.loaders import LoadFromDictionary

 data = {

Changes in another file:
@@ -30,4 +30,4 @@
 print(results.global_scores.summary)

 print("Instance Scores:")
-print(results.instance_scores.summary)
+print(results.instance_scores)

Changes in another file:
@@ -59,4 +59,4 @@
 print(results.global_scores.summary)

 print("Instance Scores:")
-print(results.instance_scores.summary)
+print(results.instance_scores)

Changes in another file:
@@ -2,9 +2,7 @@

 from unitxt import evaluate, load_dataset
 from unitxt.blocks import Task, TaskCard
-from unitxt.llm_as_judge_operators import (
-    CreateCriteriaFromString,
-)
+from unitxt.llm_as_judge import CreateCriteriaFromString
 from unitxt.loaders import LoadFromDictionary
 from unitxt.templates import NullTemplate

Changes in another file:
@@ -2,7 +2,7 @@

 from unitxt import evaluate, load_dataset
 from unitxt.blocks import Task, TaskCard
-from unitxt.llm_as_judge_operators import LoadCriteria
+from unitxt.llm_as_judge import LoadCriteria
 from unitxt.loaders import LoadFromDictionary
 from unitxt.templates import NullTemplate

Changes in another file:
@@ -4,8 +4,7 @@
 from unitxt.api import evaluate, load_dataset
 from unitxt.card import Task, TaskCard
 from unitxt.inference import CrossProviderInferenceEngine
-from unitxt.llm_as_judge import LLMJudgePairwise
-from unitxt.llm_as_judge_operators import CreateCriteriaFromDict
+from unitxt.llm_as_judge import CreateCriteriaFromDict, LLMJudgePairwise
 from unitxt.loaders import LoadFromDictionary
 from unitxt.templates import NullTemplate

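Taken together, the import changes above consolidate the criteria operators under unitxt.llm_as_judge. A minimal sketch of the resulting import style, using only names that appear in the added lines:

from unitxt.llm_as_judge import (
    CreateCriteriaFromDict,
    CreateCriteriaFromString,
    CreateYesNoCriteriaFromString,
    LLMJudgePairwise,
    LoadCriteria,
)
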
10 changes: 10 additions & 0 deletions examples/evaluate_rag_end_to_end_dataset_with_given_predictions.py
@@ -45,11 +45,21 @@
     },
 ]

+# select recommended metrics according to your available resources.
+metrics = [
+    "metrics.rag.end_to_end.recommended.cpu_only.all",
+    # "metrics.rag.end_to_end.recommended.small_llm.all",
+    # "metrics.rag.end_to_end.recommended.llmaj_watsonx.all",
+    # "metrics.rag.end_to_end.recommended.llmaj_rits.all"
+    # "metrics.rag.end_to_end.recommended.llmaj_azure.all"
+]
+
 dataset = create_dataset(
     task="tasks.rag.end_to_end",
     test_set=dataset,
     split="test",
     postprocessors=[],
+    metrics=metrics,
 )

 results = evaluate(predictions, dataset)
10 changes: 10 additions & 0 deletions examples/evaluate_rag_response_generation.py
@@ -58,13 +58,23 @@
     ),
 )

+# select recommended metrics according to your available resources.
+metrics = [
+    "metrics.rag.response_generation.recommended.cpu_only.all",
+    # "metrics.rag.response_generation.recommended.small_llm.all",
+    # "metrics.rag.response_generation.recommended.llmaj_watsonx.all",
+    # "metrics.rag.response_generation.recommended.llmaj_rits.all"
+    # "metrics.rag.response_generation.recommended.llmaj_azure.all"
+]
+
 # Verbalize the dataset using the template
 dataset = load_dataset(
     card=card,
     template_card_index="simple",
     format="formats.chat_api",
     split="test",
     max_test_instances=10,
+    metrics=metrics,
 )

