From 6dcf08e0e3e4e340b104228fde6ee1d8f4e08e5c Mon Sep 17 00:00:00 2001 From: Lilach Eden <117581332+lilacheden@users.noreply.github.com> Date: Sun, 19 Jan 2025 21:13:11 +0200 Subject: [PATCH] Refactor rag metrics and judges (#1515) --- ...al_rag_results_with_binary_llm_as_judge.py | 22 +- ...d_to_end_dataset_with_given_predictions.py | 10 + examples/evaluate_rag_response_generation.py | 10 + .../classification/classification_engines.py | 57 ++-- prepare/metrics/hhem.py | 26 +- prepare/metrics/llm_as_judge/rag_judge.py | 67 +++- prepare/metrics/rag.py | 27 +- prepare/metrics/rag_answer_correctness.py | 313 +++++++++++------- prepare/metrics/rag_answer_relevance.py | 122 ++++--- prepare/metrics/rag_context_correctness.py | 144 +++++--- prepare/metrics/rag_context_relevance.py | 91 +++-- prepare/metrics/rag_faithfulness.py | 192 ++++++++--- prepare/metrics/rag_metrics_deprecated.py | 227 +++++++++++++ .../metrics/rag_recommended_metric_lists.py | 77 +++++ .../templates/rag_eval/rag_eval_numeric.py | 11 +- .../classification/gpt_4_turbo_open_ai.json | 9 + .../classification/gpt_4o_open_ai.json | 9 + ...llama_3_1_405b_instruct_fp8_ibm_genai.json | 7 - .../llama_3_1_405b_instruct_fp8_rits.json | 8 +- .../llama_3_1_70b_instruct_ibm_genai.json | 7 - .../llama_3_1_70b_instruct_rits.json | 8 +- .../llama_3_1_70b_instruct_watsonx.json | 9 + .../llama_3_3_70b_instruct_rits.json | 8 +- .../llama_3_3_70b_instruct_watsonx.json | 9 + .../mixtral_8x7b_instruct_v01_ibm_genai.json | 7 - .../mixtral_8x7b_instruct_v01_rits.json | 7 - ...nce_engine_context_relevance_q_c_ares.json | 2 +- ...struct_wml_context_relevance_q_c_ares.json | 2 +- ...l_context_relevance_q_c_ares_logprobs.json | 2 +- .../metrics/rag/answer_correctness.json | 3 +- .../answer_correctness/bert_score_recall.json | 3 +- .../bert_score_recall_ml.json | 3 +- .../answer_correctness/sentence_bert_bge.json | 3 +- .../sentence_bert_mini_lm.json | 3 +- .../rag/answer_correctness/token_recall.json | 3 +- .../catalog/metrics/rag/answer_inference.json | 3 +- .../rag/answer_relevance/token_recall.json | 3 +- .../catalog/metrics/rag/answer_reward.json | 3 +- .../metrics/rag/context_correctness.json | 3 +- .../metrics/rag/context_correctness/map.json | 3 +- .../metrics/rag/context_correctness/mrr.json | 3 +- .../context_correctness/retrieval_at_k.json | 3 +- .../metrics/rag/context_perplexity.json | 16 +- .../metrics/rag/context_relevance.json | 3 +- .../generic_inference_engine_q_c_ares.json | 2 +- ...ric_inference_engine_q_c_ares_numeric.json | 2 +- .../llama_3_1_70b_instruct_wml_q_c_ares.json | 2 +- ..._1_70b_instruct_wml_q_c_ares_logprobs.json | 2 +- ...3_1_70b_instruct_wml_q_c_ares_numeric.json | 2 +- .../perplexity_flan_t5_small.json | 3 +- .../context_relevance/sentence_bert_bge.json | 3 +- .../sentence_bert_mini_lm.json | 3 +- .../context_relevance/token_precision.json | 3 +- .../answer_correctness/bert_score_recall.json | 15 + .../bert_score_recall_ml.json | 15 + .../generic_inference_engine_judge.json | 15 + .../gpt_4o_azure_judge.json | 13 + .../llama_3_3_70b_instruct_rits_judge.json | 13 + .../llama_3_3_70b_instruct_watsonx_judge.json | 13 + .../answer_correctness/sentence_bert_bge.json | 15 + .../sentence_bert_mini_lm.json | 15 + .../answer_correctness/token_recall.json | 15 + .../answer_relevance/answer_reward.json | 22 ++ .../generic_inference_engine_judge.json | 15 + .../answer_relevance/gpt_4o_azure_judge.json | 13 + .../llama_3_3_70b_instruct_rits_judge.json | 13 + .../llama_3_3_70b_instruct_watsonx_judge.json | 13 + 
.../answer_relevance/token_recall.json | 22 ++ .../end_to_end/context_correctness/map.json | 19 ++ .../end_to_end/context_correctness/mrr.json | 19 ++ .../context_correctness/retrieval_at_k.json | 19 ++ .../generic_inference_engine_judge.json | 15 + .../context_relevance/gpt_4o_azure_judge.json | 13 + .../llama_3_3_70b_instruct_rits_judge.json | 13 + .../llama_3_3_70b_instruct_watsonx_judge.json | 13 + .../perplexity_flan_t5_small.json | 18 + .../context_relevance/sentence_bert_bge.json | 18 + .../sentence_bert_mini_lm.json | 18 + .../context_relevance/token_precision.json | 18 + .../faithfulness/bert_score_k_precision.json | 15 + .../bert_score_k_precision_ml.json | 15 + .../generic_inference_engine_judge.json | 15 + .../faithfulness/gpt_4o_azure_judge.json | 13 + .../llama_3_3_70b_instruct_rits_judge.json | 13 + .../llama_3_3_70b_instruct_watsonx_judge.json | 13 + .../faithfulness/sentence_bert_bge.json | 15 + .../faithfulness/sentence_bert_mini_lm.json | 15 + .../faithfulness/token_k_precision.json | 15 + .../faithfulness/vectara_hhem_2_1.json | 15 + .../end_to_end/recommended/cpu_only/all.json | 10 + .../recommended/llmaj_azure/all.json | 10 + .../recommended/llmaj_rits/all.json | 10 + .../recommended/llmaj_watsonx/all.json | 10 + .../end_to_end/recommended/small_llm/all.json | 10 + .../rag/external_rag/answer_correctness.json | 15 + .../answer_correctness/bert_score_recall.json | 15 + .../bert_score_recall_ml.json | 15 + .../generic_inference_engine_judge.json | 13 + .../gpt_4o_azure_judge.json | 11 + .../llama_3_3_70b_instruct_rits_judge.json | 11 + .../llama_3_3_70b_instruct_watsonx_judge.json | 11 + .../answer_correctness/sentence_bert_bge.json | 15 + .../sentence_bert_mini_lm.json | 15 + .../answer_correctness/token_recall.json | 15 + .../answer_relevance/answer_reward.json | 22 ++ .../generic_inference_engine_judge.json | 13 + .../answer_relevance/gpt_4o_azure_judge.json | 11 + .../llama_3_3_70b_instruct_rits_judge.json | 11 + .../llama_3_3_70b_instruct_watsonx_judge.json | 11 + .../answer_relevance/token_recall.json | 22 ++ .../rag/external_rag/answer_reward.json | 22 ++ .../rag/external_rag/context_correctness.json | 19 ++ .../external_rag/context_correctness/map.json | 19 ++ .../external_rag/context_correctness/mrr.json | 19 ++ .../context_correctness/retrieval_at_k.json | 19 ++ .../rag/external_rag/context_relevance.json | 18 + .../generic_inference_engine_judge.json | 13 + .../context_relevance/gpt_4o_azure_judge.json | 11 + .../llama_3_3_70b_instruct_rits_judge.json | 11 + .../llama_3_3_70b_instruct_watsonx_judge.json | 11 + .../perplexity_flan_t5_small.json | 18 + .../context_relevance/sentence_bert_bge.json | 18 + .../sentence_bert_mini_lm.json | 18 + .../context_relevance/token_precision.json | 18 + .../rag/external_rag/faithfulness.json | 15 + .../faithfulness/bert_score_k_precision.json | 15 + .../bert_score_k_precision_ml.json | 15 + .../generic_inference_engine_judge.json | 13 + .../faithfulness/gpt_4o_azure_judge.json | 11 + .../llama_3_3_70b_instruct_rits_judge.json | 11 + .../llama_3_3_70b_instruct_watsonx_judge.json | 11 + .../faithfulness/sentence_bert_bge.json | 15 + .../faithfulness/sentence_bert_mini_lm.json | 15 + .../faithfulness/token_k_precision.json | 15 + .../faithfulness/vectara_hhem_2_1.json | 15 + .../recommended/cpu_only/all.json | 10 + .../recommended/llmaj_azure/all.json | 10 + .../recommended/llmaj_rits/all.json | 10 + .../recommended/llmaj_watsonx/all.json | 10 + .../recommended/small_llm/all.json | 10 + .../catalog/metrics/rag/faithfulness.json 
| 3 +- .../faithfulness/bert_score_k_precision.json | 3 +- .../bert_score_k_precision_ml.json | 3 +- .../rag/faithfulness/sentence_bert_bge.json | 3 +- .../faithfulness/sentence_bert_mini_lm.json | 3 +- .../rag/faithfulness/token_k_precision.json | 3 +- .../rag/faithfulness/vectara_hhem_2_1.json | 6 +- .../answer_correctness/bert_score_recall.json | 14 + .../bert_score_recall_ml.json | 14 + .../generic_inference_engine_judge.json | 15 + .../gpt_4o_azure_judge.json | 13 + .../llama_3_3_70b_instruct_rits_judge.json | 13 + .../llama_3_3_70b_instruct_watsonx_judge.json | 13 + .../answer_correctness/sentence_bert_bge.json | 14 + .../sentence_bert_mini_lm.json | 14 + .../answer_correctness/token_recall.json | 14 + .../answer_relevance/answer_reward.json | 21 ++ .../generic_inference_engine_judge.json | 15 + .../answer_relevance/gpt_4o_azure_judge.json | 13 + .../llama_3_3_70b_instruct_rits_judge.json | 13 + .../llama_3_3_70b_instruct_watsonx_judge.json | 13 + .../answer_relevance/token_recall.json | 21 ++ .../bert_score/deberta_large_mnli.json | 3 +- .../deberta_v3_base_mnli_xnli_ml.json | 3 +- .../correctness/token_overlap.json | 3 +- .../faithfullness/token_overlap.json | 3 +- .../faithfulness/bert_score_k_precision.json | 14 + .../bert_score_k_precision_ml.json | 14 + .../generic_inference_engine_judge.json | 15 + .../faithfulness/gpt_4o_azure_judge.json | 13 + .../llama_3_3_70b_instruct_rits_judge.json | 13 + .../llama_3_3_70b_instruct_watsonx_judge.json | 13 + .../faithfulness/sentence_bert_bge.json | 14 + .../faithfulness/sentence_bert_mini_lm.json | 14 + .../faithfulness/token_k_precision.json | 14 + .../faithfulness/vectara_hhem_2_1.json | 14 + .../recommended/cpu_only/all.json | 8 + .../recommended/llmaj_azure/all.json | 8 + .../recommended/llmaj_rits/all.json | 8 + .../recommended/llmaj_watsonx/all.json | 8 + .../recommended/small_llm/all.json | 8 + .../vectara_groundedness_hhem_2_1.json | 3 + .../judge_answer_relevance_numeric.json | 2 +- .../judge_no_question_simplified_verbal.json | 2 +- ...o_question_simplified_verbal_good_bad.json | 2 +- ...judge_with_question_simplified_verbal.json | 2 +- ...h_question_simplified_verbal_good_bad.json | 2 +- src/unitxt/inference.py | 17 +- src/unitxt/llm_as_judge_from_template.py | 22 +- src/unitxt/test_utils/metrics.py | 1 + 190 files changed, 2851 insertions(+), 471 deletions(-) create mode 100644 prepare/metrics/rag_metrics_deprecated.py create mode 100644 prepare/metrics/rag_recommended_metric_lists.py create mode 100644 src/unitxt/catalog/engines/classification/gpt_4_turbo_open_ai.json create mode 100644 src/unitxt/catalog/engines/classification/gpt_4o_open_ai.json delete mode 100644 src/unitxt/catalog/engines/classification/llama_3_1_405b_instruct_fp8_ibm_genai.json delete mode 100644 src/unitxt/catalog/engines/classification/llama_3_1_70b_instruct_ibm_genai.json create mode 100644 src/unitxt/catalog/engines/classification/llama_3_1_70b_instruct_watsonx.json create mode 100644 src/unitxt/catalog/engines/classification/llama_3_3_70b_instruct_watsonx.json delete mode 100644 src/unitxt/catalog/engines/classification/mixtral_8x7b_instruct_v01_ibm_genai.json delete mode 100644 src/unitxt/catalog/engines/classification/mixtral_8x7b_instruct_v01_rits.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/bert_score_recall.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/bert_score_recall_ml.json create mode 100644 
src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/generic_inference_engine_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/gpt_4o_azure_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/llama_3_3_70b_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/llama_3_3_70b_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/sentence_bert_bge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/sentence_bert_mini_lm.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/token_recall.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/answer_reward.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/generic_inference_engine_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/gpt_4o_azure_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/llama_3_3_70b_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/llama_3_3_70b_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/token_recall.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/context_correctness/map.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/context_correctness/mrr.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/context_correctness/retrieval_at_k.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/generic_inference_engine_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/gpt_4o_azure_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/llama_3_3_70b_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/llama_3_3_70b_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/perplexity_flan_t5_small.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/sentence_bert_bge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/sentence_bert_mini_lm.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/token_precision.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/bert_score_k_precision.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/bert_score_k_precision_ml.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/generic_inference_engine_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/gpt_4o_azure_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/llama_3_3_70b_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/llama_3_3_70b_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/sentence_bert_bge.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/sentence_bert_mini_lm.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/token_k_precision.json create mode 100644 
src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/vectara_hhem_2_1.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/recommended/cpu_only/all.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/recommended/llmaj_azure/all.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/recommended/llmaj_rits/all.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/recommended/llmaj_watsonx/all.json create mode 100644 src/unitxt/catalog/metrics/rag/end_to_end/recommended/small_llm/all.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_correctness.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/bert_score_recall.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/bert_score_recall_ml.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/generic_inference_engine_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/gpt_4o_azure_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/llama_3_3_70b_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/llama_3_3_70b_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/sentence_bert_bge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/sentence_bert_mini_lm.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/token_recall.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/answer_reward.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/generic_inference_engine_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/gpt_4o_azure_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/llama_3_3_70b_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/llama_3_3_70b_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/token_recall.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/answer_reward.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/context_correctness.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/context_correctness/map.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/context_correctness/mrr.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/context_correctness/retrieval_at_k.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/context_relevance.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/context_relevance/generic_inference_engine_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/context_relevance/gpt_4o_azure_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/context_relevance/llama_3_3_70b_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/context_relevance/llama_3_3_70b_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/context_relevance/perplexity_flan_t5_small.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/context_relevance/sentence_bert_bge.json create mode 100644 
src/unitxt/catalog/metrics/rag/external_rag/context_relevance/sentence_bert_mini_lm.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/context_relevance/token_precision.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/faithfulness.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/faithfulness/bert_score_k_precision.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/faithfulness/bert_score_k_precision_ml.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/faithfulness/generic_inference_engine_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/faithfulness/gpt_4o_azure_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/faithfulness/llama_3_3_70b_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/faithfulness/llama_3_3_70b_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/faithfulness/sentence_bert_bge.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/faithfulness/sentence_bert_mini_lm.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/faithfulness/token_k_precision.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/faithfulness/vectara_hhem_2_1.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/recommended/cpu_only/all.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/recommended/llmaj_azure/all.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/recommended/llmaj_rits/all.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/recommended/llmaj_watsonx/all.json create mode 100644 src/unitxt/catalog/metrics/rag/external_rag/recommended/small_llm/all.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/bert_score_recall.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/bert_score_recall_ml.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/generic_inference_engine_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/gpt_4o_azure_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/llama_3_3_70b_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/llama_3_3_70b_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/sentence_bert_bge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/sentence_bert_mini_lm.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/token_recall.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/answer_reward.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/generic_inference_engine_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/gpt_4o_azure_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/llama_3_3_70b_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/llama_3_3_70b_instruct_watsonx_judge.json create mode 100644 
src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/token_recall.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/faithfulness/bert_score_k_precision.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/faithfulness/bert_score_k_precision_ml.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/faithfulness/generic_inference_engine_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/faithfulness/gpt_4o_azure_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/faithfulness/llama_3_3_70b_instruct_rits_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/faithfulness/llama_3_3_70b_instruct_watsonx_judge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/faithfulness/sentence_bert_bge.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/faithfulness/sentence_bert_mini_lm.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/faithfulness/token_k_precision.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/faithfulness/vectara_hhem_2_1.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/recommended/cpu_only/all.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/recommended/llmaj_azure/all.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/recommended/llmaj_rits/all.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/recommended/llmaj_watsonx/all.json create mode 100644 src/unitxt/catalog/metrics/rag/response_generation/recommended/small_llm/all.json create mode 100644 src/unitxt/catalog/metrics/vectara_groundedness_hhem_2_1.json diff --git a/examples/evaluate_external_rag_results_with_binary_llm_as_judge.py b/examples/evaluate_external_rag_results_with_binary_llm_as_judge.py index d29ca582f1..e06d553fbf 100644 --- a/examples/evaluate_external_rag_results_with_binary_llm_as_judge.py +++ b/examples/evaluate_external_rag_results_with_binary_llm_as_judge.py @@ -53,22 +53,17 @@ # Select the desired metric(s). # Each metric measures a certain aspect of the generated answer (answer_correctness, faithfulness, -# answer_relevance, context_relevance and correctness_holistic). -# All available metrics are under "catalog.metrics.rag" -# Those with extension "logprobs" provide a real value prediction in [0,1], the others provide a binary prediction. -# By default, all judges use llama_3_1_70b_instruct_wml. We will soon see how to change this. +# answer_relevance and context_relevance). +# All available metrics are under "catalog.metrics.rag.autorag.", ending with "judge" +# By default, all judges use llama_3_3_70b_instruct. We will soon see how to change this. metric_names = [ - "metrics.rag.answer_correctness.llama_3_1_70b_instruct_wml_q_a_gt_loose_logprobs", - "metrics.rag.faithfulness.llama_3_1_70b_instruct_wml_q_c_a_logprobs", + "metrics.rag.autorag.answer_correctness.llama_3_3_70b_instruct_wml_judge", + "metrics.rag.autorag.faithfulness.llama_3_3_70b_instruct_wml_judge", ] # select the desired model. 
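# A minimal sketch of the metric selection described in the comments above, using only calls
# that appear in this example file (MultiStream, SequentialOperator and the
# "[inference_model=...]" override shown below). The record fields (question, contexts,
# answer, ground_truths) and their values are illustrative assumptions, not part of the patch.
from unitxt.operators import SequentialOperator
from unitxt.stream import MultiStream

example_record = {
    "question": "What model family does watsonx.ai offer?",
    "contexts": ["watsonx.ai offers the IBM Granite model family."],
    "answer": "watsonx.ai offers IBM Granite models.",
    "ground_truths": ["The IBM Granite model family."],
}

# Pick one of the new judge metrics and (optionally) route it to a different judge engine.
judge_metric = (
    "metrics.rag.autorag.faithfulness.llama_3_3_70b_instruct_wml_judge"
    "[inference_model=engines.classification.mixtral_8x7b_instruct_v01_wml]"
)

stream = MultiStream.from_iterables({"test": [example_record]}, copying=True)
metrics_operator = SequentialOperator(steps=[judge_metric])
# Applying metrics_operator to the stream and reading the per-instance scores then follows
# the same flow as the rest of this example.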
# all available models are under "catalog.engines.classification" -model_names = [ - "engines.classification.mixtral_8x7b_instruct_v01_wml", - "engines.classification.llama_3_1_70b_instruct_wml", - # "engines.classification.gpt_4_turbo_openai", -] +model_names = ["engines.classification.mixtral_8x7b_instruct_v01_wml"] if __name__ == "__main__": multi_stream = MultiStream.from_iterables({"test": test_examples}, copying=True) @@ -79,9 +74,8 @@ for metric_name in metric_names: for model_name in model_names: - # override the metric with the inference model. the default model is llama_3_1_70b_instruct_wml so - # no need to override when using it. - llmaj_metric_name = f"{metric_name}[model={model_name}]" + # override the metric with the inference model (to use a model different from the one in the metric name) + llmaj_metric_name = f"{metric_name}[inference_model={model_name}]" # apply the metric over the input metrics_operator = SequentialOperator(steps=[llmaj_metric_name]) diff --git a/examples/evaluate_rag_end_to_end_dataset_with_given_predictions.py b/examples/evaluate_rag_end_to_end_dataset_with_given_predictions.py index 2d7e3a5098..ac6399bae8 100644 --- a/examples/evaluate_rag_end_to_end_dataset_with_given_predictions.py +++ b/examples/evaluate_rag_end_to_end_dataset_with_given_predictions.py @@ -45,11 +45,21 @@ }, ] +# select recommended metrics according to your available resources. +metrics = [ + "metrics.rag.end_to_end.recommended.cpu_only.all", + # "metrics.rag.end_to_end.recommended.small_llm.all", + # "metrics.rag.end_to_end.recommended.llmaj_watsonx.all", + # "metrics.rag.end_to_end.recommended.llmaj_rits.all" + # "metrics.rag.end_to_end.recommended.llmaj_azure.all" +] + dataset = create_dataset( task="tasks.rag.end_to_end", test_set=dataset, split="test", postprocessors=[], + metrics=metrics, ) results = evaluate(predictions, dataset) diff --git a/examples/evaluate_rag_response_generation.py b/examples/evaluate_rag_response_generation.py index 01249b5b0b..1856f61a40 100644 --- a/examples/evaluate_rag_response_generation.py +++ b/examples/evaluate_rag_response_generation.py @@ -58,6 +58,15 @@ ), ) +# select recommended metrics according to your available resources. 
+metrics = [ + "metrics.rag.response_generation.recommended.cpu_only.all", + # "metrics.rag.response_generation.recommended.small_llm.all", + # "metrics.rag.response_generation.recommended.llmaj_watsonx.all", + # "metrics.rag.response_generation.recommended.llmaj_rits.all" + # "metrics.rag.response_generation.recommended.llmaj_azure.all" +] + # Verbalize the dataset using the template dataset = load_dataset( card=card, @@ -65,6 +74,7 @@ format="formats.chat_api", split="test", max_test_instances=10, + metrics=metrics, ) diff --git a/prepare/engines/classification/classification_engines.py b/prepare/engines/classification/classification_engines.py index ebfbff69b4..635b203319 100644 --- a/prepare/engines/classification/classification_engines.py +++ b/prepare/engines/classification/classification_engines.py @@ -1,28 +1,35 @@ from unitxt import add_to_catalog from unitxt.inference import ( AzureOpenAIInferenceEngine, - IbmGenAiInferenceEngine, - RITSInferenceEngine, + CrossProviderInferenceEngine, WMLInferenceEngineGeneration, ) +model_names_to_provider = { + "llama-3-3-70b-instruct": ["watsonx", "rits"], + "llama-3-1-70b-instruct": ["watsonx", "rits"], + "gpt-4o": ["open-ai"], + "gpt-4-turbo": ["open-ai"], + "gpt-4-turbo-2024-04-09": ["azure"], + "gpt-4o-2024-08-06": ["azure"], + "mistralai/mixtral-8x7b-instruct-v01": ["ibm_wml"], + "meta-llama/llama-3-3-70b-instruct": ["ibm_wml"], + "meta-llama/llama-3-1-70b-instruct": ["ibm_wml"], + "meta-llama/llama-3-405b-instruct": ["ibm_wml"], + "llama-3-1-405b-instruct-fp8": ["rits"], +} + -def get_inference_engine(model_name, framework_name): - if framework_name == "ibm_wml": +def get_inference_engine(model_name, provider): + if provider == "ibm_wml": return WMLInferenceEngineGeneration( model_name=model_name, max_new_tokens=5, random_seed=42, decoding_method="greedy", ) - if framework_name == "ibm_gen_ai": - return IbmGenAiInferenceEngine( - model_name=model_name, - max_new_tokens=5, - random_seed=42, - decoding_method="greedy", - ) - if framework_name == "openai": + + if provider == "azure": return AzureOpenAIInferenceEngine( model_name=model_name, logprobs=True, @@ -30,27 +37,21 @@ def get_inference_engine(model_name, framework_name): temperature=0.0, top_logprobs=5, ) - if framework_name == "rits": - return RITSInferenceEngine( - model_name=model_name, logprobs=True, max_tokens=5, temperature=0.0 - ) - raise ValueError("Unsupported framework name " + framework_name) + return CrossProviderInferenceEngine( + model=model_name, + logprobs=True, + max_tokens=5, + temperature=0.0, + top_logprobs=5, + provider=provider, + ) -model_names_to_infer_framework = { - "meta-llama/llama-3-1-70b-instruct": ["ibm_wml", "rits", "ibm_gen_ai"], - "meta-llama/llama-3-3-70b-instruct": ["ibm_wml", "rits"], - "gpt-4-turbo-2024-04-09": ["openai"], - "gpt-4o-2024-08-06": ["openai"], - "mistralai/mixtral-8x7b-instruct-v01": ["ibm_wml", "ibm_gen_ai", "rits"], - "meta-llama/llama-3-1-405b-instruct-fp8": ["ibm_gen_ai", "rits"], - "meta-llama/llama-3-405b-instruct": ["ibm_wml"], -} -for judge_model_name, infer_frameworks in model_names_to_infer_framework.items(): +for judge_model_name, infer_frameworks in model_names_to_provider.items(): for infer_framework in infer_frameworks: inference_engine = get_inference_engine(judge_model_name, infer_framework) - inference_engine_label = inference_engine.get_engine_id() + inference_engine_label = inference_engine.get_engine_id().replace("-", "_") add_to_catalog( inference_engine,
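# A short sketch of the engine-registration pattern introduced in classification_engines.py
# above: a single CrossProviderInferenceEngine now covers the providers that previously
# needed dedicated engine classes. The constructor arguments mirror the ones in the patch;
# the final catalog path ("engines.classification.<engine_id>") is an assumption based on
# the catalog files added in this PR.
from unitxt import add_to_catalog
from unitxt.inference import CrossProviderInferenceEngine

judge_engine = CrossProviderInferenceEngine(
    model="llama-3-3-70b-instruct",
    provider="rits",
    logprobs=True,
    max_tokens=5,
    temperature=0.0,
    top_logprobs=5,
)
# Engine ids contain dashes, so the refactor normalizes them to underscores for catalog names.
engine_label = judge_engine.get_engine_id().replace("-", "_")
add_to_catalog(judge_engine, f"engines.classification.{engine_label}", overwrite=True)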
diff --git a/prepare/metrics/hhem.py b/prepare/metrics/hhem.py index c39899d29d..b090239206 100644 --- a/prepare/metrics/hhem.py +++ b/prepare/metrics/hhem.py @@ -1,6 +1,5 @@ from unitxt import add_to_catalog -from unitxt.metrics import FaithfulnessHHEM, MetricPipeline -from unitxt.operators import Copy +from unitxt.metrics import FaithfulnessHHEM from unitxt.test_utils.metrics import test_metric pairs = [ @@ -12,21 +11,6 @@ predictions = [p[1] for p in pairs] task_data = [{"contexts": [p[0]]} for p in pairs] -## This metric pipeline supports two usecases: -## 1. Regular unitxt flow: predictions are taken from model prediction and contexts appears in the task data -## 2. Running on external rag output: each instance contains field "answer" and field "contexts" -metric = MetricPipeline( - main_score="hhem_score", - preprocess_steps=[ - Copy( - field_to_field={"task_data/contexts": "references", "answer": "prediction"}, - not_exist_do_nothing=True, - ), - Copy(field_to_field={"contexts": "references"}, not_exist_do_nothing=True), - ], - metric=FaithfulnessHHEM(), - __description__="Vectara's halucination detection model, HHEM2.1, compares contexts and generated answer to determine faithfulness.", -) instance_targets = [ {"score": 0.01, "score_name": "hhem_score", "hhem_score": 0.01}, {"score": 0.65, "score_name": "hhem_score", "hhem_score": 0.65}, @@ -43,13 +27,13 @@ "hhem_score_ci_high": 0.65, } - +references = [[p[0]] for p in pairs] +metric = FaithfulnessHHEM() outputs = test_metric( metric=metric, predictions=predictions, - references=[[""]] * len(instance_targets), - task_data=task_data, + references=references, instance_targets=instance_targets, global_target=global_target, ) -add_to_catalog(metric, "metrics.rag.faithfulness.vectara_hhem_2_1", overwrite=True) +add_to_catalog(metric, "metrics.vectara_groundedness_hhem_2_1", overwrite=True) diff --git a/prepare/metrics/llm_as_judge/rag_judge.py b/prepare/metrics/llm_as_judge/rag_judge.py index 423b39a4ab..aa418de2d1 100644 --- a/prepare/metrics/llm_as_judge/rag_judge.py +++ b/prepare/metrics/llm_as_judge/rag_judge.py @@ -1,5 +1,4 @@ from unitxt import add_to_catalog -from unitxt.artifact import UnitxtArtifactNotFoundError, fetch_artifact from unitxt.inference import GenericInferenceEngine from unitxt.llm_as_judge import ( TaskBasedLLMasJudge, @@ -31,12 +30,12 @@ def get_prediction_field(metric_type): - return None if metric_type == "context_relevance" else "answer" + return "contexts" if metric_type == "context_relevance" else "answer" for metric_type, template_dict in metric_type_to_template_dict.items(): for template_short_name, template_name in template_dict.items(): - task_name = f"tasks.rag_eval.{metric_type}.binary" + judge_task_name = f"tasks.rag_eval.{metric_type}.binary" for logprobs_label in [ "", "_logprobs", @@ -46,10 +45,7 @@ def get_prediction_field(metric_type): template = ( f"templates.rag_eval.{metric_type}.{template_name}{logprobs_label}" ) - try: - t = fetch_artifact(template)[0] - except UnitxtArtifactNotFoundError: - continue + for inf_label, inference_model in inference_models.items(): if ( use_logprobs and inf_label == generic_engine_label @@ -60,7 +56,7 @@ def get_prediction_field(metric_type): metric = TaskBasedLLMasJudge( inference_model=inference_model, template=template, - task=task_name, + task=judge_task_name, format=None, main_score=metric_label, prediction_field=get_prediction_field(metric_type), @@ -79,7 +75,7 @@ def get_prediction_field(metric_type): metric = TaskBasedLLMasJudge( inference_model=inference_model, template=template, -
task=task_name, + task=judge_task_name, format=None, main_score=metric_label, prediction_field=get_prediction_field(metric_type), @@ -92,3 +88,56 @@ def get_prediction_field(metric_type): f"metrics.llm_as_judge.binary.{inf_label}_{metric_label}", overwrite=True, ) + + +# now add new metrics under unitxt rag tasks +metric_type_to_template_v2 = { + "faithfulness": "judge_with_question_simplified", + "context_relevance": "judge_context_relevance_ares", + "answer_correctness": "judge_loose_match_no_context", + "answer_relevance": "judge_answer_relevance", +} + +inference_models_v2 = { + "llama_3_3_70b_instruct_watsonx": "engines.classification.llama_3_3_70b_instruct_watsonx", + "llama_3_3_70b_instruct_rits": "engines.classification.llama_3_3_70b_instruct_rits", + "gpt_4o_azure": "engines.classification.gpt_4o_2024_08_06_azure_openai", + generic_engine_label: GenericInferenceEngine(), +} + +for metric_type, template_name in metric_type_to_template_v2.items(): + judge_task_name = f"tasks.rag_eval.{metric_type}.binary" + realization_suffix = metric_type_to_realization[metric_type] + template = f"templates.rag_eval.{metric_type}.{template_name}{realization_suffix}" + for inf_label, inference_model in inference_models_v2.items(): + for rag_unitxt_task in ["external_rag", "response_generation", "end_to_end"]: + if ( + rag_unitxt_task == "response_generation" + and metric_type == "context_relevance" + ): + continue + + judge_to_generator_fields_mapping = ( + {} + if rag_unitxt_task == "external_rag" + else {"ground_truths": "reference_answers"} + ) + + new_catalog_name = ( + f"metrics.rag.{rag_unitxt_task}.{metric_type}.{inf_label}_judge" + ) + metric = TaskBasedLLMasJudge( + inference_model=inference_model, + template=template, + task=judge_task_name, + format=None, + main_score=f"{metric_type}_judge", + prediction_field=get_prediction_field(metric_type), + infer_log_probs=False, + judge_to_generator_fields_mapping=judge_to_generator_fields_mapping, + ) + add_to_catalog( + metric, + new_catalog_name, + overwrite=True, + ) diff --git a/prepare/metrics/rag.py b/prepare/metrics/rag.py index 8a1f3792c1..1cc3fc2c98 100644 --- a/prepare/metrics/rag.py +++ b/prepare/metrics/rag.py @@ -347,12 +347,24 @@ # metrics.rag.recall # metrics.rag.bert_recall -for axis, base_metric, main_score in [ - ("correctness", "token_overlap", "f1"), - ("correctness", "bert_score.deberta_large_mnli", "recall"), - ("correctness", "bert_score.deberta_v3_base_mnli_xnli_ml", "recall"), - ("faithfullness", "token_overlap", "precision"), +for axis, base_metric, main_score, new_metric in [ + ("correctness", "token_overlap", "f1", "answer_correctness.token_recall"), + ( + "correctness", + "bert_score.deberta_large_mnli", + "recall", + "answer_correctness.bert_score_recall", + ), + ( + "correctness", + "bert_score.deberta_v3_base_mnli_xnli_ml", + "recall", + "answer_correctness.bert_score_recall_ml", + ), + ("faithfullness", "token_overlap", "precision", "faithfulness.token_k_precision"), ]: + deprecated_path = f"metrics.rag.response_generation.{axis}.{base_metric}" + new_metric_path = f"metrics.rag.response_generation.{new_metric}" preprocess_steps = ( [ Copy(field="task_data/contexts", to_field="references"), @@ -379,10 +391,13 @@ ], metric=f"metrics.{base_metric}", prediction_type=str, + __deprecated_msg__=f"Metric {deprecated_path} is deprecated. 
Please use {new_metric_path} instead.", ) add_to_catalog( - metric, f"metrics.rag.response_generation.{axis}.{base_metric}", overwrite=True + metric, + f"metrics.rag.response_generation.{axis}.{base_metric}", + overwrite=True, ) # end to end diff --git a/prepare/metrics/rag_answer_correctness.py b/prepare/metrics/rag_answer_correctness.py index 4208bc84f6..90b112f44b 100644 --- a/prepare/metrics/rag_answer_correctness.py +++ b/prepare/metrics/rag_answer_correctness.py @@ -1,30 +1,77 @@ from unitxt import add_to_catalog from unitxt.metrics import MetricPipeline from unitxt.operators import Copy, Rename -from unitxt.test_utils.metrics import test_evaluate, test_metric +from unitxt.test_utils.metrics import test_metric + +task_names = ["external_rag", "response_generation", "end_to_end"] +base = "metrics.rag" +default = "token_recall" +dimension = "answer_correctness" + + +def get_scores_prefix(metric_catalog_name, dim_name): + if metric_catalog_name == dim_name: + return f"{dim_name}_" + return f"{dim_name}_{metric_catalog_name}_" + + +def add_scores_prefix_to_target(target, metric_catalog_name, dim_name): + prefix = get_scores_prefix(metric_catalog_name, dim_name) + new_target = { + f"{prefix}" + k + if k not in ["score", "score_name", "num_of_instances"] + and not k.startswith("score") + else k: v + for k, v in target.items() + } + new_target["score_name"] = prefix + new_target["score_name"] + return new_target + + +def get_test_pipeline_task_preprocess_steps(task): + if task == "external_rag": + return [ + Rename(field_to_field={"task_data/ground_truths": "ground_truths"}), + Rename(field_to_field={"task_data/answer": "answer"}), + ] + if task == "response_generation": + return [ + Copy(field_to_field={"task_data/answer": "prediction"}), + Copy( + field_to_field={ + "task_data/ground_truths": "task_data/reference_answers" + } + ), + ] + if task == "end_to_end": + return [ + Copy(field_to_field={"task_data/answer": "prediction/answer"}), + Copy( + field_to_field={ + "task_data/ground_truths": "task_data/reference_answers" + } + ), + ] + raise ValueError(f"Unsupported rag task for {dimension}:{task}") def test_answer_correctness( task_data, catalog_name, global_target, instance_targets, main_score ): - # test the evaluate call - test_evaluate( - global_target, - instance_targets=[ - {"score": instance["score"]} for instance in instance_targets - ], - task_data=task_data, - metric_name=catalog_name, - ) # test using the usual metric pipeline test_pipeline = MetricPipeline( main_score=main_score, - preprocess_steps=[ - Rename(field_to_field={"task_data/ground_truths": "ground_truths"}), - Rename(field_to_field={"task_data/answer": "answer"}), - ], + preprocess_steps=get_test_pipeline_task_preprocess_steps(task), metric=f"{catalog_name}", ) + short_catalog_name = catalog_name.split(".")[-1] + instance_targets = [ + add_scores_prefix_to_target(i, short_catalog_name, dimension) + for i in instance_targets + ] + global_target = add_scores_prefix_to_target( + global_target, short_catalog_name, dimension + ) test_metric( metric=test_pipeline, predictions=[None] * len(instance_targets), @@ -35,41 +82,69 @@ def test_answer_correctness( ) -base = "metrics.rag.answer_correctness" -default = "token_recall" - -for new_catalog_name, base_catalog_name, main_score in [ - ("token_recall", "metrics.token_overlap", "recall"), - ("bert_score_recall", "metrics.bert_score.deberta_large_mnli", "recall"), - ( - "bert_score_recall_ml", - "metrics.bert_score.deberta_v3_base_mnli_xnli_ml", - "recall", - ), - 
("sentence_bert_bge", "metrics.sentence_bert.bge_large_en_1_5", "sbert_score"), - ("sentence_bert_mini_lm", "metrics.sentence_bert.minilm_l12_v2", "sbert_score"), -]: - metric = MetricPipeline( - main_score=main_score, - preprocess_steps=[ +def get_preprocess_steps(task): + if task == "external_rag": + return [ Copy( field_to_field={ - "task_data/reference_answers": "references", + "ground_truths": "references", "answer": "prediction", }, - not_exist_do_nothing=True, + ) + ] + if task == "response_generation": + return [ + Copy( + field_to_field={ + "task_data/reference_answers": "references", + } ), + ] + if task == "end_to_end": + return [ Copy( - field_to_field={"ground_truths": "references"}, - not_exist_do_nothing=True, + field_to_field={ + "task_data/reference_answers": "references", + "prediction/answer": "prediction", + } ), - ], - metric=base_catalog_name, - ) - add_to_catalog(metric, f"{base}.{new_catalog_name}", overwrite=True) + ] + raise ValueError(f"Unsupported rag task {task}") - if new_catalog_name == default: - add_to_catalog(metric, base, overwrite=True) + +for task in task_names: + preprocess_steps = get_preprocess_steps(task) + for new_catalog_name, base_catalog_name, main_score in [ + ("token_recall", "metrics.token_overlap", "recall"), + ("bert_score_recall", "metrics.bert_score.deberta_large_mnli", "recall"), + ( + "bert_score_recall_ml", + "metrics.bert_score.deberta_v3_base_mnli_xnli_ml", + "recall", + ), + ("sentence_bert_bge", "metrics.sentence_bert.bge_large_en_1_5", "sbert_score"), + ("sentence_bert_mini_lm", "metrics.sentence_bert.minilm_l12_v2", "sbert_score"), + ]: + metric = MetricPipeline( + main_score=main_score, + preprocess_steps=preprocess_steps.copy(), + metric=base_catalog_name, + score_prefix=get_scores_prefix(new_catalog_name, dimension), + ) + add_to_catalog( + metric, + f"{base}.{task}.{dimension}.{new_catalog_name}", + overwrite=True, + ) + + if new_catalog_name == default and task == "external_rag": + metric = MetricPipeline( + main_score=main_score, + preprocess_steps=preprocess_steps.copy(), + metric=base_catalog_name, + score_prefix=f"{dimension}_", + ) + add_to_catalog(metric, f"{base}.{task}.{dimension}", overwrite=True) def test_answer_correctness_sentence_bert(): @@ -85,10 +160,9 @@ def test_answer_correctness_sentence_bert(): "answer": "Here is a dog.", }, ] - test_answer_correctness( task_data, - catalog_name="metrics.rag.answer_correctness.sentence_bert_bge", + catalog_name=f"{base}.{task}.{dimension}.sentence_bert_bge", global_target={ "score": 0.64, "score_ci_high": 0.75, @@ -116,7 +190,7 @@ def test_answer_correctness_sentence_bert(): test_answer_correctness( task_data, - catalog_name="metrics.rag.answer_correctness.sentence_bert_mini_lm", + catalog_name=f"{base}.{task}.{dimension}.sentence_bert_mini_lm", global_target={ "score": 0.17, "score_ci_high": 0.42, @@ -179,13 +253,13 @@ def test_answer_correctness_token_recall(task_data): } for catalog_name, global_target, instance_targets in [ + # ( + # f"{base}.{task}.{dimension}", + # recall_global_target, + # recall_instance_targets, + # ), ( - "metrics.rag.answer_correctness", - recall_global_target, - recall_instance_targets, - ), - ( - "metrics.rag.answer_correctness.token_recall", + f"{base}.{task}.{dimension}.token_recall", recall_global_target, recall_instance_targets, ), @@ -212,85 +286,86 @@ def test_answer_correctness_token_recall(task_data): "answer": "B C D", }, ] -# This test is here since it does not involve any models -test_answer_correctness_token_recall(task_data) + if 
__name__ == "__main__": # Tests which involve models: test_answer_correctness_sentence_bert() + for task in task_names: + test_answer_correctness_token_recall(task_data) - test_answer_correctness( - task_data, - catalog_name="metrics.rag.answer_correctness.bert_score_recall", - global_target={ - "f1": 0.71, - "f1_ci_high": 0.71, - "f1_ci_low": 0.71, - "precision": 0.74, - "precision_ci_high": 0.77, - "precision_ci_low": 0.71, - "recall": 0.71, - "recall_ci_high": 0.71, - "recall_ci_low": 0.71, - "score": 0.71, - "score_ci_high": 0.71, - "score_ci_low": 0.71, - "score_name": "recall", - "num_of_instances": 2, - }, - instance_targets=[ - { - "f1": 0.71, - "precision": 0.77, - "recall": 0.71, - "score": 0.71, - "score_name": "recall", - }, - { + test_answer_correctness( + task_data, + catalog_name=f"{base}.{task}.{dimension}.bert_score_recall", + global_target={ "f1": 0.71, - "precision": 0.71, + "f1_ci_high": 0.71, + "f1_ci_low": 0.71, + "precision": 0.74, + "precision_ci_high": 0.77, + "precision_ci_low": 0.71, "recall": 0.71, + "recall_ci_high": 0.71, + "recall_ci_low": 0.71, "score": 0.71, + "score_ci_high": 0.71, + "score_ci_low": 0.71, "score_name": "recall", + "num_of_instances": 2, }, - ], - main_score="recall", - ) + instance_targets=[ + { + "f1": 0.71, + "precision": 0.77, + "recall": 0.71, + "score": 0.71, + "score_name": "recall", + }, + { + "f1": 0.71, + "precision": 0.71, + "recall": 0.71, + "score": 0.71, + "score_name": "recall", + }, + ], + main_score="recall", + ) - test_answer_correctness( - task_data, - catalog_name="metrics.rag.answer_correctness.bert_score_recall_ml", - global_target={ - "f1": 0.86, - "f1_ci_high": 0.97, - "f1_ci_low": 0.74, - "precision": 0.86, - "precision_ci_high": 0.97, - "precision_ci_low": 0.74, - "recall": 0.86, - "recall_ci_high": 0.97, - "recall_ci_low": 0.74, - "score": 0.86, - "score_ci_high": 0.97, - "score_ci_low": 0.74, - "score_name": "recall", - "num_of_instances": 2, - }, - instance_targets=[ - { - "f1": 0.97, - "precision": 0.97, - "recall": 0.97, - "score": 0.97, - "score_name": "recall", - }, - { - "f1": 0.74, - "precision": 0.74, - "recall": 0.74, - "score": 0.74, + test_answer_correctness( + task_data, + catalog_name=f"{base}.{task}.{dimension}.bert_score_recall_ml", + global_target={ + "f1": 0.86, + "f1_ci_high": 0.97, + "f1_ci_low": 0.74, + "precision": 0.86, + "precision_ci_high": 0.97, + "precision_ci_low": 0.74, + "recall": 0.86, + "recall_ci_high": 0.97, + "recall_ci_low": 0.74, + "score": 0.86, + "score_ci_high": 0.97, + "score_ci_low": 0.74, "score_name": "recall", + "num_of_instances": 2, }, - ], - main_score="recall", - ) + instance_targets=[ + { + "f1": 0.97, + "precision": 0.97, + "recall": 0.97, + "score": 0.97, + "score_name": "recall", + }, + { + "f1": 0.74, + "precision": 0.74, + "recall": 0.74, + "score": 0.74, + "score_name": "recall", + }, + ], + main_score="recall", + ) diff --git a/prepare/metrics/rag_answer_relevance.py b/prepare/metrics/rag_answer_relevance.py index 0be234d05b..d947898334 100644 --- a/prepare/metrics/rag_answer_relevance.py +++ b/prepare/metrics/rag_answer_relevance.py @@ -4,51 +4,81 @@ ) from unitxt.operators import Copy, ListFieldValues -answer_reward = MetricPipeline( - main_score="score", - preprocess_steps=[ - Copy( - field_to_field={"task_data/question": "references", "answer": "prediction"}, - not_exist_do_nothing=True, - ), - Copy(field_to_field={"question": "references"}, not_exist_do_nothing=True), - # This metric compares the answer (as the prediction) to the question (as the 
reference). - # We have to wrap the question by a list (otherwise it will be a string), - # because references are expected to be lists - ListFieldValues(fields=["references"], to_field="references"), - ], - metric="metrics.reward.deberta_v3_large_v2", -) -add_to_catalog(answer_reward, "metrics.rag.answer_reward", overwrite=True) +task_names = ["external_rag", "response_generation", "end_to_end"] +base = "metrics.rag" -answer_token_overlap = MetricPipeline( - main_score="recall", - preprocess_steps=[ - Copy( - field_to_field={"task_data/question": "references", "answer": "prediction"}, - not_exist_do_nothing=True, - ), - Copy(field_to_field={"question": "references"}, not_exist_do_nothing=True), - # This metric compares the answer (as the prediction) to the question (as the reference). - # We have to wrap the question by a list (otherwise it will be a string), - # because references are expected to be lists - ListFieldValues(fields=["references"], to_field="references"), - ], - metric="metrics.token_overlap", -) -add_to_catalog( - answer_token_overlap, "metrics.rag.answer_relevance.token_recall", overwrite=True -) -answer_inference = MetricPipeline( - main_score="perplexity", - preprocess_steps=[ - Copy( - field_to_field={"task_data/contexts": "references", "answer": "prediction"}, - not_exist_do_nothing=True, - ), - Copy(field_to_field={"contexts": "references"}, not_exist_do_nothing=True), - ], - metric="metrics.perplexity_nli.t5_nli_mixture", -) -add_to_catalog(answer_inference, "metrics.rag.answer_inference", overwrite=True) +def get_preprocess_steps(task): + # This metric compares the answer (as the prediction) to the question (as the reference). + # We have to wrap the question by a list (otherwise it will be a string), + # because references are expected to be lists + last_step = ListFieldValues(fields=["references"], to_field="references") + if task == "external_rag": + return [ + Copy( + field_to_field={ + "question": "references", + "answer": "prediction", + }, + ), + last_step, + ] + if task == "response_generation": + return [ + Copy( + field_to_field={ + "task_data/question": "references", + } + ), + last_step, + ] + if task == "end_to_end": + return [ + Copy( + field_to_field={ + "task_data/question": "references", + "prediction/answer": "prediction", + } + ), + last_step, + ] + raise ValueError(f"Unsupported rag task {task}") + + +for task in task_names: + answer_reward = MetricPipeline( + main_score="reward_score", + preprocess_steps=get_preprocess_steps(task), + metric="metrics.reward.deberta_v3_large_v2", + score_prefix="answer_relevance_", + ) + add_to_catalog( + answer_reward, f"{base}.{task}.answer_relevance.answer_reward", overwrite=True + ) + if task == "external_rag": + add_to_catalog(answer_reward, f"{base}.{task}.answer_reward", overwrite=True) + + answer_token_overlap = MetricPipeline( + main_score="recall", + preprocess_steps=get_preprocess_steps(task), + metric="metrics.token_overlap", + score_prefix="answer_relevance_token_recall_", + ) + add_to_catalog( + answer_token_overlap, + f"{base}.{task}.answer_relevance.token_recall", + overwrite=True, + ) +# +# answer_inference = MetricPipeline( +# main_score="perplexity", +# preprocess_steps=[ +# Copy( +# field_to_field={"task_data/contexts": "references", "answer": "prediction"}, +# not_exist_do_nothing=True, +# ), +# Copy(field_to_field={"contexts": "references"}, not_exist_do_nothing=True), +# ], +# metric="metrics.perplexity_nli.t5_nli_mixture", +# ) +# add_to_catalog(answer_inference, 
"metrics.rag.answer_inference", overwrite=True) diff --git a/prepare/metrics/rag_context_correctness.py b/prepare/metrics/rag_context_correctness.py index d2c6490b2a..b716eb41b0 100644 --- a/prepare/metrics/rag_context_correctness.py +++ b/prepare/metrics/rag_context_correctness.py @@ -2,30 +2,91 @@ from unitxt.collections_operators import Wrap from unitxt.metrics import MetricPipeline from unitxt.operators import Copy, Rename -from unitxt.test_utils.metrics import test_evaluate, test_metric +from unitxt.test_utils.metrics import test_metric -base = "metrics.rag.context_correctness" default = "mrr" +base = "metrics.rag" +tasks = ["external_rag", "end_to_end"] +dimension = "context_correctness" + + +def get_scores_prefix(metric_catalog_name, dim_name): + return f"{dim_name}_" + + +def add_scores_prefix_to_target(target, metric_catalog_name, dim_name): + prefix = get_scores_prefix(metric_catalog_name, dim_name) + new_target = { + f"{prefix}" + k + if k not in ["score", "score_name", "num_of_instances"] + and not k.startswith("score") + else k: v + for k, v in target.items() + } + new_target["score_name"] = prefix + new_target["score_name"] + return new_target + + +def get_preprocess_steps(task): + if task == "external_rag": + return [ + Copy(field="context_ids", to_field="prediction"), + Wrap( + field="ground_truths_context_ids", inside="list", to_field="references" + ), + ] + if task == "end_to_end": + return [ + Copy(field="prediction/context_ids", to_field="prediction"), + Wrap( + field="task_data/reference_context_ids", + inside="list", + to_field="references", + ), + ] + raise ValueError(f"Unsupported rag task {task}") + + +def get_test_pipeline_task_preprocess_steps(task): + if task == "external_rag": + return [ + Rename(field_to_field={"task_data/context_ids": "context_ids"}), + Rename( + field_to_field={ + "task_data/ground_truths_context_ids": "ground_truths_context_ids" + } + ), + ] + if task == "end_to_end": + return [ + Rename(field_to_field={"task_data/context_ids": "prediction/context_ids"}), + Rename( + field_to_field={ + "task_data/ground_truths_context_ids": "task_data/reference_context_ids" + } + ), + ] + raise ValueError(f"Unsupported rag task for {dimension}:{task}") + for new_catalog_name, base_catalog_name, main_score in [ ("mrr", "metrics.mrr", "mrr"), ("map", "metrics.map", "map"), ("retrieval_at_k", "metrics.retrieval_at_k", "match_at_1"), ]: - metric = MetricPipeline( - main_score=main_score, - preprocess_steps=[ - Copy(field="context_ids", to_field="prediction"), - Wrap( - field="ground_truths_context_ids", inside="list", to_field="references" - ), - ], - metric=base_catalog_name, - ) - add_to_catalog(metric, f"{base}.{new_catalog_name}", overwrite=True) + for task in tasks: + metric = MetricPipeline( + main_score=main_score, + preprocess_steps=get_preprocess_steps(task).copy(), + metric=base_catalog_name, + score_prefix=get_scores_prefix(new_catalog_name, dimension), + ) + add_to_catalog( + metric, f"{base}.{task}.{dimension}.{new_catalog_name}", overwrite=True + ) - if new_catalog_name == default: - add_to_catalog(metric, base, overwrite=True) + if new_catalog_name == default and task == "external_rag": + add_to_catalog(metric, f"{base}.{task}.{dimension}", overwrite=True) def test_context_correctness(): @@ -158,53 +219,54 @@ def test_context_correctness(): for catalog_name, global_target, instance_targets, main_score in [ ( - "metrics.rag.context_correctness.map", + f"{base}.{task}.{dimension}.map", map_global_target, map_instance_targets, "map", ), ( - 
"metrics.rag.context_correctness.mrr", + f"{base}.{task}.{dimension}.mrr", mrr_global_target, mrr_instance_targets, "mrr", ), + # ( + # f"{base}.{task}.{dimension}", + # mrr_global_target, + # mrr_instance_targets, + # "mrr", + # ), ( - "metrics.rag.context_correctness", - mrr_global_target, - mrr_instance_targets, - "mrr", - ), - ( - "metrics.rag.context_correctness.retrieval_at_k", + f"{base}.{task}.{dimension}.retrieval_at_k", retrieval_at_k_global_target, retrieval_at_k_instance_targets, "match_at_1", ), ]: - # test the evaluate call - test_evaluate( - global_target, - instance_targets=[ - {"score": instance["score"]} for instance in instance_targets - ], - task_data=task_data, - metric_name=catalog_name, - ) + # # test the evaluate call + # test_evaluate( + # global_target, + # instance_targets=[ + # {"score": instance["score"]} for instance in instance_targets + # ], + # task_data=task_data, + # metric_name=catalog_name, + # ) # test using the usual metric pipeline test_pipeline = MetricPipeline( main_score=main_score, - preprocess_steps=[ - Rename(field_to_field={"task_data/context_ids": "context_ids"}), - Rename( - field_to_field={ - "task_data/ground_truths_context_ids": "ground_truths_context_ids" - } - ), - ], + preprocess_steps=get_test_pipeline_task_preprocess_steps(task), metric=f"{catalog_name}", ) + short_catalog_name = catalog_name.split(".")[-1] + instance_targets = [ + add_scores_prefix_to_target(i, short_catalog_name, dimension) + for i in instance_targets + ] + global_target = add_scores_prefix_to_target( + global_target, short_catalog_name, dimension + ) test_metric( metric=test_pipeline, predictions=[None, None], diff --git a/prepare/metrics/rag_context_relevance.py b/prepare/metrics/rag_context_relevance.py index 71e1dc3df7..6833d8ce65 100644 --- a/prepare/metrics/rag_context_relevance.py +++ b/prepare/metrics/rag_context_relevance.py @@ -4,43 +4,62 @@ ) from unitxt.operators import Copy -base = "metrics.rag.context_relevance" +base = "metrics.rag" +tasks = ["external_rag", "end_to_end"] default = "perplexity_flan_t5_small" +dimension = "context_relevance" -for new_catalog_name, base_catalog_name, main_score in [ - ("perplexity_flan_t5_small", "metrics.perplexity_q.flan_t5_small", "perplexity"), - ("sentence_bert_bge", "metrics.sentence_bert.bge_large_en_1_5", "sbert_score"), - ("sentence_bert_mini_lm", "metrics.sentence_bert.minilm_l12_v2", "sbert_score"), - ("token_precision", "metrics.token_overlap", "precision"), -]: - metric = MetricPipeline( - main_score=main_score, - preprocess_steps=[ - Copy( - field_to_field={ - "task_data/contexts": "references", - "question": "prediction", - }, - not_exist_do_nothing=True, - ), - Copy(field_to_field={"contexts": "references"}, not_exist_do_nothing=True), - ], - metric=base_catalog_name, - ) - add_to_catalog(metric, f"{base}.{new_catalog_name}", overwrite=True) - if new_catalog_name == default: - add_to_catalog(metric, base, overwrite=True) +def get_preprocess_steps(task): + if task == "external_rag": + return [ + Copy(field="contexts", to_field="references"), + Copy(field="question", to_field="prediction"), + ] + if task == "end_to_end": + return [ + Copy(field="prediction/contexts", to_field="references"), + Copy(field="task_data/question", to_field="prediction"), + ] + raise ValueError(f"Unsupported rag task for {dimension}:{task}") -context_perplexity = MetricPipeline( - main_score="score", - preprocess_steps=[ - Copy(field="contexts", to_field="references"), - Copy(field="question", to_field="prediction"), - ], - 
metric="metrics.perplexity_q.flan_t5_small", - postprocess_steps=[ - Copy(field="score/instance/reference_scores", to_field="score/instance/score") - ], -) -add_to_catalog(context_perplexity, "metrics.rag.context_perplexity", overwrite=True) + +for task in tasks: + for new_catalog_name, base_catalog_name, main_score in [ + ( + "perplexity_flan_t5_small", + "metrics.perplexity_q.flan_t5_small", + "perplexity", + ), + ("sentence_bert_bge", "metrics.sentence_bert.bge_large_en_1_5", "sbert_score"), + ("sentence_bert_mini_lm", "metrics.sentence_bert.minilm_l12_v2", "sbert_score"), + ("token_precision", "metrics.token_overlap", "precision"), + ]: + metric = MetricPipeline( + main_score=main_score, + preprocess_steps=get_preprocess_steps(task).copy(), + metric=base_catalog_name, + score_prefix=f"{dimension}_{new_catalog_name}_", + ) + add_to_catalog( + metric, f"{base}.{task}.{dimension}.{new_catalog_name}", overwrite=True + ) + + if new_catalog_name == default and task == "external_rag": + metric = MetricPipeline( + main_score=main_score, + preprocess_steps=get_preprocess_steps(task).copy(), + metric=base_catalog_name, + score_prefix=f"{dimension}_", + ) + add_to_catalog(metric, f"{base}.{task}.{dimension}", overwrite=True) +# +# context_perplexity = MetricPipeline( +# main_score="score", +# preprocess_steps=get_test_pipeline_task_preprocess_steps(task).copy(), +# metric="metrics.perplexity_q.flan_t5_small", +# postprocess_steps=[ +# Copy(field="score/instance/reference_scores", to_field="score/instance/score") +# ], +# ) +# add_to_catalog(context_perplexity, "metrics.rag.context_perplexity", overwrite=True) diff --git a/prepare/metrics/rag_faithfulness.py b/prepare/metrics/rag_faithfulness.py index eaf2f4be60..e33d192c4b 100644 --- a/prepare/metrics/rag_faithfulness.py +++ b/prepare/metrics/rag_faithfulness.py @@ -3,63 +3,145 @@ MetricPipeline, ) from unitxt.operators import Copy, Rename -from unitxt.test_utils.metrics import test_evaluate, test_metric +from unitxt.test_utils.metrics import test_metric -base = "metrics.rag.faithfulness" +base = "metrics.rag" default = "token_k_precision" +dimension = "faithfulness" +task_names = ["external_rag", "response_generation", "end_to_end"] -for new_catalog_name, base_catalog_name, main_score in [ - ("token_k_precision", "metrics.token_overlap", "precision"), - ("bert_score_k_precision", "metrics.bert_score.deberta_large_mnli", "precision"), - ( - "bert_score_k_precision_ml", - "metrics.bert_score.deberta_v3_base_mnli_xnli_ml", - "precision", - ), - ("sentence_bert_bge", "metrics.sentence_bert.bge_large_en_1_5", "sbert_score"), - ("sentence_bert_mini_lm", "metrics.sentence_bert.minilm_l12_v2", "sbert_score"), -]: - metric = MetricPipeline( - main_score=main_score, - preprocess_steps=[ + +def get_scores_prefix(metric_catalog_name, dim_name): + if metric_catalog_name == dim_name: + return f"{dim_name}_" + return f"{dim_name}_{metric_catalog_name}_" + + +def add_scores_prefix_to_target(target, metric_catalog_name, dim_name): + prefix = get_scores_prefix(metric_catalog_name, dim_name) + new_target = { + f"{prefix}" + k + if k not in ["score", "score_name", "num_of_instances"] + and not k.startswith("score") + else k: v + for k, v in target.items() + } + new_target["score_name"] = prefix + new_target["score_name"] + return new_target + + +def get_preprocess_steps(task): + if task == "external_rag": + return [ Copy( field_to_field={ - "task_data/contexts": "references", + "contexts": "references", "answer": "prediction", }, - not_exist_do_nothing=True, ), - 
Copy(field_to_field={"contexts": "references"}, not_exist_do_nothing=True), - ], - metric=base_catalog_name, - ) - add_to_catalog(metric, f"{base}.{new_catalog_name}", overwrite=True) + ] + if task == "response_generation": + return [ + Copy( + field_to_field={ + "task_data/contexts": "references", + } + ), + ] + if task == "end_to_end": + return [ + Copy( + field_to_field={ + "prediction/contexts": "references", + "prediction/answer": "prediction", + } + ), + ] + raise ValueError(f"Unsupported rag task {task}") - if new_catalog_name == default: - add_to_catalog(metric, base, overwrite=True) + +def get_test_pipeline_task_preprocess_steps(task): + if task == "external_rag": + return [ + Rename(field_to_field={"task_data/contexts": "contexts"}), + Rename(field_to_field={"task_data/answer": "answer"}), + ] + if task == "response_generation": + return [ + Copy(field_to_field={"task_data/answer": "prediction"}), + ] + if task == "end_to_end": + return [ + Copy(field_to_field={"task_data/answer": "prediction/answer"}), + Copy(field_to_field={"task_data/contexts": "prediction/contexts"}), + ] + raise ValueError(f"Unsupported rag task {task}") + + +for task in task_names: + for new_catalog_name, base_catalog_name, main_score in [ + ("token_k_precision", "metrics.token_overlap", "precision"), + ( + "bert_score_k_precision", + "metrics.bert_score.deberta_large_mnli", + "precision", + ), + ( + "bert_score_k_precision_ml", + "metrics.bert_score.deberta_v3_base_mnli_xnli_ml", + "precision", + ), + ("sentence_bert_bge", "metrics.sentence_bert.bge_large_en_1_5", "sbert_score"), + ("sentence_bert_mini_lm", "metrics.sentence_bert.minilm_l12_v2", "sbert_score"), + ("vectara_hhem_2_1", "metrics.vectara_groundedness_hhem_2_1", "hhem_score"), + ]: + metric = MetricPipeline( + main_score=main_score, + preprocess_steps=get_preprocess_steps(task), + metric=base_catalog_name, + score_prefix=get_scores_prefix(new_catalog_name, dimension), + ) + add_to_catalog( + metric, f"{base}.{task}.{dimension}.{new_catalog_name}", overwrite=True + ) + + if new_catalog_name == default and task == "external_rag": + metric = MetricPipeline( + main_score=main_score, + preprocess_steps=get_preprocess_steps(task), + metric=base_catalog_name, + score_prefix=f"{dimension}_", + ) + add_to_catalog(metric, f"{base}.{task}.{dimension}", overwrite=True) def test_faithfulness( - task_data, catalog_name, global_target, instance_targets, main_score + task_data, catalog_name, global_target, instance_targets, main_score, task ): + # print(catalog_name) # test the evaluate call - test_evaluate( - global_target, - instance_targets=[ - {"score": instance["score"]} for instance in instance_targets - ], - task_data=task_data, - metric_name=catalog_name, - ) + # test_evaluate( + # global_target, + # instance_targets=[ + # {"score": instance["score"]} for instance in instance_targets + # ], + # task_data=task_data, + # metric_name=catalog_name, + # ) # test using the usual metric pipeline test_pipeline = MetricPipeline( main_score=main_score, - preprocess_steps=[ - Rename(field_to_field={"task_data/contexts": "contexts"}), - Rename(field_to_field={"task_data/answer": "answer"}), - ], + preprocess_steps=get_test_pipeline_task_preprocess_steps(task), metric=f"{catalog_name}", ) + short_catalog_name = catalog_name.split(".")[-1] + instance_targets = [ + add_scores_prefix_to_target(i, short_catalog_name, dimension) + for i in instance_targets + ] + global_target = add_scores_prefix_to_target( + global_target, short_catalog_name, dimension + ) test_metric( 
metric=test_pipeline, predictions=[None] * len(instance_targets), @@ -70,7 +152,7 @@ def test_faithfulness( ) -def test_faithfulness_sentence_bert(): +def test_faithfulness_sentence_bert(task): task_data = [ { # Similar sentences @@ -86,7 +168,7 @@ def test_faithfulness_sentence_bert(): test_faithfulness( task_data, - catalog_name="metrics.rag.faithfulness.sentence_bert_bge", + catalog_name=f"{base}.{task}.{dimension}.sentence_bert_bge", global_target={ "score": 0.64, "score_ci_high": 0.75, @@ -110,11 +192,12 @@ def test_faithfulness_sentence_bert(): }, ], main_score="sbert_score", + task=task, ) test_faithfulness( task_data, - catalog_name="metrics.rag.faithfulness.sentence_bert_mini_lm", + catalog_name=f"{base}.{task}.{dimension}.sentence_bert_mini_lm", global_target={ "score": 0.17, "score_ci_high": 0.42, @@ -138,10 +221,11 @@ def test_faithfulness_sentence_bert(): }, ], main_score="sbert_score", + task=task, ) -def test_faithfulness_token_k_precision(): +def test_faithfulness_token_k_precision(task): # don't use "A" as a token because it is considered an article and removed by the token overlap # metric @@ -192,13 +276,13 @@ def test_faithfulness_token_k_precision(): } for catalog_name, global_target, instance_targets in [ + # ( + # f"{base}.{task}.{dimension}", + # precision_global_target, + # precision_instance_targets, + # ), ( - base, - precision_global_target, - precision_instance_targets, - ), - ( - f"{base}.{default}", + f"{base}.{task}.{dimension}.{default}", precision_global_target, precision_instance_targets, ), @@ -209,12 +293,14 @@ def test_faithfulness_token_k_precision(): global_target, instance_targets, main_score="precision", + task=task, ) -# This test is here since it does not involve any models -test_faithfulness_token_k_precision() - if __name__ == "__main__": - # Tests which involve models: - test_faithfulness_sentence_bert() + for task in task_names: + # This test does not involve any models + test_faithfulness_token_k_precision(task) + + # Tests which involve models: + test_faithfulness_sentence_bert(task) diff --git a/prepare/metrics/rag_metrics_deprecated.py b/prepare/metrics/rag_metrics_deprecated.py new file mode 100644 index 0000000000..7b769df48a --- /dev/null +++ b/prepare/metrics/rag_metrics_deprecated.py @@ -0,0 +1,227 @@ +from unitxt import add_to_catalog +from unitxt.collections_operators import Wrap +from unitxt.metrics import MetricPipeline +from unitxt.operators import Copy, ListFieldValues + +base = "metrics.rag" +new_base = "metrics.rag.external_rag" + + +def add_metric_pipeline_to_catalog( + metric_main_score, + metric_preprocess_steps, + orig_metric_catalog_name, + metric_dimension, + metric_new_catalog_name="", +): + metric_path = f"{base}.{metric_dimension}.{metric_new_catalog_name}".strip(".") + new_metric_path = get_replacing_metric(metric_path) + metric = MetricPipeline( + main_score=metric_main_score, + preprocess_steps=metric_preprocess_steps, + metric=orig_metric_catalog_name, + __deprecated_msg__=f"This metric should be replaced with {new_metric_path}", + ) + add_to_catalog(metric, metric_path, overwrite=True) + + +def add_to_catalog_with_default( + metric_main_score, + metric_preprocess_steps, + orig_metric_catalog_name, + metric_dimension, + metric_new_catalog_name, + default_metric_name, +): + add_metric_pipeline_to_catalog( + metric_main_score, + metric_preprocess_steps, + orig_metric_catalog_name, + metric_dimension, + metric_new_catalog_name, + ) + if new_catalog_name == default_metric_name: + add_metric_pipeline_to_catalog( + 
metric_main_score, + metric_preprocess_steps, + orig_metric_catalog_name, + metric_dimension, + ) + + +def get_replacing_metric(depr_metric): + return depr_metric.replace(base, new_base) + + +default_ac = "token_recall" +preprocess_steps = [ + Copy( + field_to_field={ + "task_data/reference_answers": "references", + "answer": "prediction", + }, + not_exist_do_nothing=True, + ), + Copy( + field_to_field={"ground_truths": "references"}, + not_exist_do_nothing=True, + ), +] +for new_catalog_name, base_catalog_name, main_score in [ + ("token_recall", "metrics.token_overlap", "recall"), + ("bert_score_recall", "metrics.bert_score.deberta_large_mnli", "recall"), + ( + "bert_score_recall_ml", + "metrics.bert_score.deberta_v3_base_mnli_xnli_ml", + "recall", + ), + ("sentence_bert_bge", "metrics.sentence_bert.bge_large_en_1_5", "sbert_score"), + ("sentence_bert_mini_lm", "metrics.sentence_bert.minilm_l12_v2", "sbert_score"), +]: + add_to_catalog_with_default( + main_score, + preprocess_steps, + base_catalog_name, + "answer_correctness", + new_catalog_name, + default_ac, + ) + + +################## +# answer_relevance +################## +answer_relevance_preprocess_steps = preprocess_steps = [ + Copy( + field_to_field={"task_data/question": "references", "answer": "prediction"}, + not_exist_do_nothing=True, + ), + Copy(field_to_field={"question": "references"}, not_exist_do_nothing=True), + # This metric compares the answer (as the prediction) to the question (as the reference). + # We have to wrap the question by a list (otherwise it will be a string), + # because references are expected to be lists + ListFieldValues(fields=["references"], to_field="references"), +] +add_metric_pipeline_to_catalog( + "score", preprocess_steps, "metrics.reward.deberta_v3_large_v2", "answer_reward" +) +add_metric_pipeline_to_catalog( + "recall", + preprocess_steps, + "metrics.token_overlap", + "answer_relevance", + "token_recall", +) + +answer_inference = MetricPipeline( + main_score="perplexity", + preprocess_steps=[ + Copy( + field_to_field={"task_data/contexts": "references", "answer": "prediction"}, + not_exist_do_nothing=True, + ), + Copy(field_to_field={"contexts": "references"}, not_exist_do_nothing=True), + ], + metric="metrics.perplexity_nli.t5_nli_mixture", + __deprecated_msg__="This metric is deprecated", +) +add_to_catalog(answer_inference, "metrics.rag.answer_inference", overwrite=True) + + +##################### +# context correctness +##################### +default_context_correctness = "mrr" +preprocess_steps = [ + Copy(field="context_ids", to_field="prediction"), + Wrap(field="ground_truths_context_ids", inside="list", to_field="references"), +] +for new_catalog_name, base_catalog_name, main_score in [ + ("mrr", "metrics.mrr", "mrr"), + ("map", "metrics.map", "map"), + ("retrieval_at_k", "metrics.retrieval_at_k", "match_at_1"), +]: + add_to_catalog_with_default( + main_score, + preprocess_steps, + base_catalog_name, + "context_correctness", + new_catalog_name, + default_context_correctness, + ) + + +#################### +# Context Relevance +#################### +default_context_relevance = "perplexity_flan_t5_small" +preprocess_steps = [ + Copy( + field_to_field={ + "task_data/contexts": "references", + "question": "prediction", + }, + not_exist_do_nothing=True, + ), + Copy(field_to_field={"contexts": "references"}, not_exist_do_nothing=True), +] +for new_catalog_name, base_catalog_name, main_score in [ + ("perplexity_flan_t5_small", "metrics.perplexity_q.flan_t5_small", "perplexity"), + 
("sentence_bert_bge", "metrics.sentence_bert.bge_large_en_1_5", "sbert_score"), + ("sentence_bert_mini_lm", "metrics.sentence_bert.minilm_l12_v2", "sbert_score"), + ("token_precision", "metrics.token_overlap", "precision"), +]: + add_to_catalog_with_default( + main_score, + preprocess_steps, + base_catalog_name, + "context_relevance", + new_catalog_name, + default_context_relevance, + ) + +context_perplexity = MetricPipeline( + main_score="score", + preprocess_steps=preprocess_steps, + metric="metrics.perplexity_q.flan_t5_small", + postprocess_steps=[ + Copy(field="score/instance/reference_scores", to_field="score/instance/score") + ], + __deprecated_msg__="This metric is deprecated. Use metrics.rag.external_rag.context_relevance instead.", +) +add_to_catalog(context_perplexity, "metrics.rag.context_perplexity", overwrite=True) + +############## +# faithfulness +############## +default_faithfulness = "token_k_precision" +preprocess_steps = [ + Copy( + field_to_field={ + "task_data/contexts": "references", + "answer": "prediction", + }, + not_exist_do_nothing=True, + ), + Copy(field_to_field={"contexts": "references"}, not_exist_do_nothing=True), +] +for new_catalog_name, base_catalog_name, main_score in [ + ("token_k_precision", "metrics.token_overlap", "precision"), + ("bert_score_k_precision", "metrics.bert_score.deberta_large_mnli", "precision"), + ( + "bert_score_k_precision_ml", + "metrics.bert_score.deberta_v3_base_mnli_xnli_ml", + "precision", + ), + ("sentence_bert_bge", "metrics.sentence_bert.bge_large_en_1_5", "sbert_score"), + ("sentence_bert_mini_lm", "metrics.sentence_bert.minilm_l12_v2", "sbert_score"), + ("vectara_hhem_2_1", "metrics.vectara_groundedness_hhem_2_1", "hhem_score"), +]: + add_to_catalog_with_default( + main_score, + preprocess_steps, + base_catalog_name, + "faithfulness", + new_catalog_name, + default_faithfulness, + ) diff --git a/prepare/metrics/rag_recommended_metric_lists.py b/prepare/metrics/rag_recommended_metric_lists.py new file mode 100644 index 0000000000..cec3f563fe --- /dev/null +++ b/prepare/metrics/rag_recommended_metric_lists.py @@ -0,0 +1,77 @@ +from unitxt import add_to_catalog +from unitxt.metrics import MetricsList + +recommended_metrics = { + "cpu_only": { + "answer_correctness": "token_recall", + "faithfulness": "token_k_precision", + "answer_relevance": "token_recall", + "context_relevance": "token_precision", + "context_correctness": "mrr", + }, + "small_llm": { + "answer_correctness": "bert_score_recall_ml", + "faithfulness": "vectara_hhem_2_1", + "answer_relevance": "answer_reward", + "context_relevance": "sentence_bert_mini_lm", + "context_correctness": "mrr", + }, + "llmaj_watsonx": { + "answer_correctness": "llama_3_3_70b_instruct_watsonx_judge", + "faithfulness": "llama_3_3_70b_instruct_watsonx_judge", + "answer_relevance": "llama_3_3_70b_instruct_watsonx_judge", + "context_relevance": "llama_3_3_70b_instruct_watsonx_judge", + "context_correctness": "mrr", + }, + "llmaj_rits": { + "answer_correctness": "llama_3_3_70b_instruct_rits_judge", + "faithfulness": "llama_3_3_70b_instruct_rits_judge", + "answer_relevance": "llama_3_3_70b_instruct_rits_judge", + "context_relevance": "llama_3_3_70b_instruct_rits_judge", + "context_correctness": "mrr", + }, + "llmaj_azure": { + "answer_correctness": "gpt_4o_azure_judge", + "faithfulness": "gpt_4o_azure_judge", + "answer_relevance": "gpt_4o_azure_judge", + "context_relevance": "gpt_4o_azure_judge", + "context_correctness": "mrr", + }, +} + + +def get_metrics_types_per_task(unitxt_task): + 
metric_types = ["answer_correctness", "faithfulness", "answer_relevance"] + if unitxt_task != "response_generation": + metric_types.extend(["context_relevance", "context_correctness"]) + return metric_types + + +def get_recommended_metrics(resources_string, rag_unitxt_task): + recommended_metrics_types_to_names = recommended_metrics[resources_string] + metric_types = get_metrics_types_per_task(rag_unitxt_task) + recommended_metrics_types_to_names = dict( + filter( + lambda x: x[0] in metric_types, recommended_metrics_types_to_names.items() + ) + ) + return [ + f"metrics.rag.{rag_unitxt_task}.{k}.{v}" + for k, v in recommended_metrics_types_to_names.items() + ] + + +def register_recommended_metric_lists(): + for resource_str in recommended_metrics.keys(): + for rag_unitxt_task in ["response_generation", "end_to_end", "external_rag"]: + metrics = MetricsList( + get_recommended_metrics(resource_str, rag_unitxt_task) + ) + add_to_catalog( + metrics, + f"metrics.rag.{rag_unitxt_task}.recommended.{resource_str}.all", + overwrite=True, + ) + + +register_recommended_metric_lists() diff --git a/prepare/templates/rag_eval/rag_eval_numeric.py b/prepare/templates/rag_eval/rag_eval_numeric.py index 93f4ffba92..948432fa49 100644 --- a/prepare/templates/rag_eval/rag_eval_numeric.py +++ b/prepare/templates/rag_eval/rag_eval_numeric.py @@ -124,6 +124,7 @@ def add_rag_templates( faithfilness_instructions_with_question_simplified_category = """You are given a question, the corresponding evidence and a prediction from a model. Compare the "Prediction" and the "Evidence" to determine to what extent the prediction is grounded in the evidence. To be grounded in the evidence, all the information of the prediction must either be present in the evidence or deducible from the evidence.\n +Base your answer only on the information in the evidence. If the prediction is correct but not present in the evidence - it is not grounded. The question is only given for context, and is irrelevant for determining the groundedness of the prediction. Reply with one of the 4 options, without any further explanations: "Completely Grounded" - if the prediction is grounded in the evidence. @@ -134,6 +135,7 @@ def add_rag_templates( faithfulness_instructions_no_question_simplified_category = """You are given a grounding evidence and a prediction from a model. Compare the "Prediction" and the "Evidence" to determine to what extent the prediction is grounded in the evidence. To be grounded in the evidence, all the information of the prediction must either be present in the evidence or deducible from the evidence.\n +Base your answer only on the information in the evidence. If the prediction is correct but not present in the evidence than it is not grounded. Reply with one of the 4 options, without any further explanations: "Completely Grounded" - if the prediction is grounded in the evidence. "Mostly grounded" - if the vast majority of the information in the prediction is grounded in the evidence, but there is a small or negligible part of the prediction which is not present in the evidence. @@ -144,6 +146,7 @@ def add_rag_templates( faithfilness_instructions_with_question_simplified_verbal_good_bad = """You are given a question, the corresponding evidence and a prediction from a model. Compare the "Prediction" and the "Evidence" to determine whether or not the prediction is grounded in the evidence. A good prediction is a prediction which is grounded in the evidence, i.e. 
all the information of the prediction must either be present in the evidence or deducible from the evidence.\n +Base your answer only on the information in the evidence. If the prediction is correct but not present in the evidence - it is not grounded. The question is only given for context, and is irrelevant for determining the groundedness of the prediction. Provide a rating from one of the following choices: 'Very Bad', 'Bad', 'Mediocre', 'Good', 'Very Good'. Reply using the format of [[rating]], for example: '[[Mediocre]]'. """ @@ -151,6 +154,7 @@ def add_rag_templates( faithfulness_instructions_no_question_simplified_verbal_good_bad = """You are given a grounding evidence and a prediction from a model. Compare the "Prediction" and the "Evidence" to determine whether or not the prediction is grounded in the evidence. A good prediction is a prediction which is grounded in the evidence, i.e. all the information of the prediction must either be present in the evidence or deducible from the evidence.\n +Base your answer only on the information in the evidence. If the prediction is correct but not present in the evidence - it is not grounded. Provide a rating from one of the following choices: 'Very Bad', 'Bad', 'Mediocre', 'Good', 'Very Good'. Reply using the format of [[rating]], for example: '[[Mediocre]]'. """ @@ -158,6 +162,11 @@ Provide a rating from one of the following choices: 'Very Bad', 'Bad', 'Mediocre', 'Good', 'Very Good'. Reply using the format of [[rating]], for example: '[[Mediocre]]'. """ +answer_relevance_instructions_numeric = """You are given a question and a prediction from a model. Please determine whether or not the prediction answers the question. +The prediction answers the question if it provides a relevant, appropriate and complete answer to the question. +Ignore the correctness of the prediction in your judgement. +On a scale of 0 to 10, to what extent does the prediction answer the question? Reply with your rating score without any preceding explanation.\n""" + answer_relevance_instructions_verbal_good_bad = """You are given a question and a prediction from a model. Please determine to what extent, on a scale of 0 to 10, the prediction answers the question.
Reply with your rating score without any preceding explanation.\n""" @@ -239,7 +248,7 @@ def add_rag_templates( answer_relevance_templates_dict = { "judge_answer_relevance_numeric": { "input": answer_relevance_input_format, - "instruction": answer_relevance_instructions, + "instruction": answer_relevance_instructions_numeric, "template_type": "numeric", }, "judge_answer_relevance_verbal_good_bad": { diff --git a/src/unitxt/catalog/engines/classification/gpt_4_turbo_open_ai.json b/src/unitxt/catalog/engines/classification/gpt_4_turbo_open_ai.json new file mode 100644 index 0000000000..454daaa7b9 --- /dev/null +++ b/src/unitxt/catalog/engines/classification/gpt_4_turbo_open_ai.json @@ -0,0 +1,9 @@ +{ + "__type__": "cross_provider_inference_engine", + "model": "gpt-4-turbo", + "logprobs": true, + "max_tokens": 5, + "temperature": 0.0, + "top_logprobs": 5, + "provider": "open-ai" +} diff --git a/src/unitxt/catalog/engines/classification/gpt_4o_open_ai.json b/src/unitxt/catalog/engines/classification/gpt_4o_open_ai.json new file mode 100644 index 0000000000..1960da5d01 --- /dev/null +++ b/src/unitxt/catalog/engines/classification/gpt_4o_open_ai.json @@ -0,0 +1,9 @@ +{ + "__type__": "cross_provider_inference_engine", + "model": "gpt-4o", + "logprobs": true, + "max_tokens": 5, + "temperature": 0.0, + "top_logprobs": 5, + "provider": "open-ai" +} diff --git a/src/unitxt/catalog/engines/classification/llama_3_1_405b_instruct_fp8_ibm_genai.json b/src/unitxt/catalog/engines/classification/llama_3_1_405b_instruct_fp8_ibm_genai.json deleted file mode 100644 index ab320c2f46..0000000000 --- a/src/unitxt/catalog/engines/classification/llama_3_1_405b_instruct_fp8_ibm_genai.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "__type__": "ibm_gen_ai_inference_engine", - "model_name": "meta-llama/llama-3-1-405b-instruct-fp8", - "max_new_tokens": 5, - "random_seed": 42, - "decoding_method": "greedy" -} diff --git a/src/unitxt/catalog/engines/classification/llama_3_1_405b_instruct_fp8_rits.json b/src/unitxt/catalog/engines/classification/llama_3_1_405b_instruct_fp8_rits.json index b1a814ea9d..679745f112 100644 --- a/src/unitxt/catalog/engines/classification/llama_3_1_405b_instruct_fp8_rits.json +++ b/src/unitxt/catalog/engines/classification/llama_3_1_405b_instruct_fp8_rits.json @@ -1,7 +1,9 @@ { - "__type__": "rits_inference_engine", - "model_name": "meta-llama/llama-3-1-405b-instruct-fp8", + "__type__": "cross_provider_inference_engine", + "model": "llama-3-1-405b-instruct-fp8", "logprobs": true, "max_tokens": 5, - "temperature": 0.0 + "temperature": 0.0, + "top_logprobs": 5, + "provider": "rits" } diff --git a/src/unitxt/catalog/engines/classification/llama_3_1_70b_instruct_ibm_genai.json b/src/unitxt/catalog/engines/classification/llama_3_1_70b_instruct_ibm_genai.json deleted file mode 100644 index 43f892ad79..0000000000 --- a/src/unitxt/catalog/engines/classification/llama_3_1_70b_instruct_ibm_genai.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "__type__": "ibm_gen_ai_inference_engine", - "model_name": "meta-llama/llama-3-1-70b-instruct", - "max_new_tokens": 5, - "random_seed": 42, - "decoding_method": "greedy" -} diff --git a/src/unitxt/catalog/engines/classification/llama_3_1_70b_instruct_rits.json b/src/unitxt/catalog/engines/classification/llama_3_1_70b_instruct_rits.json index 814f9641f8..bddbdaa26a 100644 --- a/src/unitxt/catalog/engines/classification/llama_3_1_70b_instruct_rits.json +++ b/src/unitxt/catalog/engines/classification/llama_3_1_70b_instruct_rits.json @@ -1,7 +1,9 @@ { - "__type__": 
"rits_inference_engine", - "model_name": "meta-llama/llama-3-1-70b-instruct", + "__type__": "cross_provider_inference_engine", + "model": "llama-3-1-70b-instruct", "logprobs": true, "max_tokens": 5, - "temperature": 0.0 + "temperature": 0.0, + "top_logprobs": 5, + "provider": "rits" } diff --git a/src/unitxt/catalog/engines/classification/llama_3_1_70b_instruct_watsonx.json b/src/unitxt/catalog/engines/classification/llama_3_1_70b_instruct_watsonx.json new file mode 100644 index 0000000000..7c28097e95 --- /dev/null +++ b/src/unitxt/catalog/engines/classification/llama_3_1_70b_instruct_watsonx.json @@ -0,0 +1,9 @@ +{ + "__type__": "cross_provider_inference_engine", + "model": "llama-3-1-70b-instruct", + "logprobs": true, + "max_tokens": 5, + "temperature": 0.0, + "top_logprobs": 5, + "provider": "watsonx" +} diff --git a/src/unitxt/catalog/engines/classification/llama_3_3_70b_instruct_rits.json b/src/unitxt/catalog/engines/classification/llama_3_3_70b_instruct_rits.json index ebc12f7129..4ccb021a5e 100644 --- a/src/unitxt/catalog/engines/classification/llama_3_3_70b_instruct_rits.json +++ b/src/unitxt/catalog/engines/classification/llama_3_3_70b_instruct_rits.json @@ -1,7 +1,9 @@ { - "__type__": "rits_inference_engine", - "model_name": "meta-llama/llama-3-3-70b-instruct", + "__type__": "cross_provider_inference_engine", + "model": "llama-3-3-70b-instruct", "logprobs": true, "max_tokens": 5, - "temperature": 0.0 + "temperature": 0.0, + "top_logprobs": 5, + "provider": "rits" } diff --git a/src/unitxt/catalog/engines/classification/llama_3_3_70b_instruct_watsonx.json b/src/unitxt/catalog/engines/classification/llama_3_3_70b_instruct_watsonx.json new file mode 100644 index 0000000000..257d129e0a --- /dev/null +++ b/src/unitxt/catalog/engines/classification/llama_3_3_70b_instruct_watsonx.json @@ -0,0 +1,9 @@ +{ + "__type__": "cross_provider_inference_engine", + "model": "llama-3-3-70b-instruct", + "logprobs": true, + "max_tokens": 5, + "temperature": 0.0, + "top_logprobs": 5, + "provider": "watsonx" +} diff --git a/src/unitxt/catalog/engines/classification/mixtral_8x7b_instruct_v01_ibm_genai.json b/src/unitxt/catalog/engines/classification/mixtral_8x7b_instruct_v01_ibm_genai.json deleted file mode 100644 index 136476e16a..0000000000 --- a/src/unitxt/catalog/engines/classification/mixtral_8x7b_instruct_v01_ibm_genai.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "__type__": "ibm_gen_ai_inference_engine", - "model_name": "mistralai/mixtral-8x7b-instruct-v01", - "max_new_tokens": 5, - "random_seed": 42, - "decoding_method": "greedy" -} diff --git a/src/unitxt/catalog/engines/classification/mixtral_8x7b_instruct_v01_rits.json b/src/unitxt/catalog/engines/classification/mixtral_8x7b_instruct_v01_rits.json deleted file mode 100644 index 33c3e83a02..0000000000 --- a/src/unitxt/catalog/engines/classification/mixtral_8x7b_instruct_v01_rits.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "__type__": "rits_inference_engine", - "model_name": "mistralai/mixtral-8x7b-instruct-v01", - "logprobs": true, - "max_tokens": 5, - "temperature": 0.0 -} diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_context_relevance_q_c_ares.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_context_relevance_q_c_ares.json index b468dd9397..be6cc1b10d 100644 --- a/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_context_relevance_q_c_ares.json +++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_context_relevance_q_c_ares.json @@ -7,7 
+7,7 @@ "task": "tasks.rag_eval.context_relevance.binary", "format": null, "main_score": "context_relevance_q_c_ares", - "prediction_field": null, + "prediction_field": "contexts", "infer_log_probs": false, "__deprecated_msg__": "This metric should be replaced with metrics.rag.context_relevance.generic_inference_engine_q_c_ares" } diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares.json index 7dc9cf47da..dd7cfc3e41 100644 --- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares.json +++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares.json @@ -5,7 +5,7 @@ "task": "tasks.rag_eval.context_relevance.binary", "format": null, "main_score": "context_relevance_q_c_ares", - "prediction_field": null, + "prediction_field": "contexts", "infer_log_probs": false, "__deprecated_msg__": "This metric should be replaced with metrics.rag.context_relevance.llama_3_1_70b_instruct_wml_q_c_ares" } diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json index 136027af6d..590b902a34 100644 --- a/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json +++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/llama_3_1_70b_instruct_wml_context_relevance_q_c_ares_logprobs.json @@ -5,7 +5,7 @@ "task": "tasks.rag_eval.context_relevance.binary", "format": null, "main_score": "context_relevance_q_c_ares_logprobs", - "prediction_field": null, + "prediction_field": "contexts", "infer_log_probs": true, "__deprecated_msg__": "This metric should be replaced with metrics.rag.context_relevance.llama_3_1_70b_instruct_wml_q_c_ares_logprobs" } diff --git a/src/unitxt/catalog/metrics/rag/answer_correctness.json b/src/unitxt/catalog/metrics/rag/answer_correctness.json index ae10ddbc33..82df6bdf95 100644 --- a/src/unitxt/catalog/metrics/rag/answer_correctness.json +++ b/src/unitxt/catalog/metrics/rag/answer_correctness.json @@ -18,5 +18,6 @@ "not_exist_do_nothing": true } ], - "metric": "metrics.token_overlap" + "metric": "metrics.token_overlap", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.answer_correctness" } diff --git a/src/unitxt/catalog/metrics/rag/answer_correctness/bert_score_recall.json b/src/unitxt/catalog/metrics/rag/answer_correctness/bert_score_recall.json index d0c80e746a..227f36004d 100644 --- a/src/unitxt/catalog/metrics/rag/answer_correctness/bert_score_recall.json +++ b/src/unitxt/catalog/metrics/rag/answer_correctness/bert_score_recall.json @@ -18,5 +18,6 @@ "not_exist_do_nothing": true } ], - "metric": "metrics.bert_score.deberta_large_mnli" + "metric": "metrics.bert_score.deberta_large_mnli", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.answer_correctness.bert_score_recall" } diff --git a/src/unitxt/catalog/metrics/rag/answer_correctness/bert_score_recall_ml.json b/src/unitxt/catalog/metrics/rag/answer_correctness/bert_score_recall_ml.json index 9b225d1c41..a2f606dec0 100644 --- a/src/unitxt/catalog/metrics/rag/answer_correctness/bert_score_recall_ml.json +++ 
b/src/unitxt/catalog/metrics/rag/answer_correctness/bert_score_recall_ml.json @@ -18,5 +18,6 @@ "not_exist_do_nothing": true } ], - "metric": "metrics.bert_score.deberta_v3_base_mnli_xnli_ml" + "metric": "metrics.bert_score.deberta_v3_base_mnli_xnli_ml", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.answer_correctness.bert_score_recall_ml" } diff --git a/src/unitxt/catalog/metrics/rag/answer_correctness/sentence_bert_bge.json b/src/unitxt/catalog/metrics/rag/answer_correctness/sentence_bert_bge.json index 4c00007dce..eee60c23b8 100644 --- a/src/unitxt/catalog/metrics/rag/answer_correctness/sentence_bert_bge.json +++ b/src/unitxt/catalog/metrics/rag/answer_correctness/sentence_bert_bge.json @@ -18,5 +18,6 @@ "not_exist_do_nothing": true } ], - "metric": "metrics.sentence_bert.bge_large_en_1_5" + "metric": "metrics.sentence_bert.bge_large_en_1_5", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.answer_correctness.sentence_bert_bge" } diff --git a/src/unitxt/catalog/metrics/rag/answer_correctness/sentence_bert_mini_lm.json b/src/unitxt/catalog/metrics/rag/answer_correctness/sentence_bert_mini_lm.json index b1c6fa693d..7c6915113c 100644 --- a/src/unitxt/catalog/metrics/rag/answer_correctness/sentence_bert_mini_lm.json +++ b/src/unitxt/catalog/metrics/rag/answer_correctness/sentence_bert_mini_lm.json @@ -18,5 +18,6 @@ "not_exist_do_nothing": true } ], - "metric": "metrics.sentence_bert.minilm_l12_v2" + "metric": "metrics.sentence_bert.minilm_l12_v2", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.answer_correctness.sentence_bert_mini_lm" } diff --git a/src/unitxt/catalog/metrics/rag/answer_correctness/token_recall.json b/src/unitxt/catalog/metrics/rag/answer_correctness/token_recall.json index ae10ddbc33..9136f2a4a4 100644 --- a/src/unitxt/catalog/metrics/rag/answer_correctness/token_recall.json +++ b/src/unitxt/catalog/metrics/rag/answer_correctness/token_recall.json @@ -18,5 +18,6 @@ "not_exist_do_nothing": true } ], - "metric": "metrics.token_overlap" + "metric": "metrics.token_overlap", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.answer_correctness.token_recall" } diff --git a/src/unitxt/catalog/metrics/rag/answer_inference.json b/src/unitxt/catalog/metrics/rag/answer_inference.json index d6848030f1..84358bd144 100644 --- a/src/unitxt/catalog/metrics/rag/answer_inference.json +++ b/src/unitxt/catalog/metrics/rag/answer_inference.json @@ -18,5 +18,6 @@ "not_exist_do_nothing": true } ], - "metric": "metrics.perplexity_nli.t5_nli_mixture" + "metric": "metrics.perplexity_nli.t5_nli_mixture", + "__deprecated_msg__": "This metric is deprecated" } diff --git a/src/unitxt/catalog/metrics/rag/answer_relevance/token_recall.json b/src/unitxt/catalog/metrics/rag/answer_relevance/token_recall.json index 5df7e941a3..d5fe717938 100644 --- a/src/unitxt/catalog/metrics/rag/answer_relevance/token_recall.json +++ b/src/unitxt/catalog/metrics/rag/answer_relevance/token_recall.json @@ -25,5 +25,6 @@ "to_field": "references" } ], - "metric": "metrics.token_overlap" + "metric": "metrics.token_overlap", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.answer_relevance.token_recall" } diff --git a/src/unitxt/catalog/metrics/rag/answer_reward.json b/src/unitxt/catalog/metrics/rag/answer_reward.json index 77f02e6ed2..a10971373a 100644 --- a/src/unitxt/catalog/metrics/rag/answer_reward.json +++ 
b/src/unitxt/catalog/metrics/rag/answer_reward.json @@ -25,5 +25,6 @@ "to_field": "references" } ], - "metric": "metrics.reward.deberta_v3_large_v2" + "metric": "metrics.reward.deberta_v3_large_v2", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.answer_reward" } diff --git a/src/unitxt/catalog/metrics/rag/context_correctness.json b/src/unitxt/catalog/metrics/rag/context_correctness.json index 1406f2b03f..8aa936e346 100644 --- a/src/unitxt/catalog/metrics/rag/context_correctness.json +++ b/src/unitxt/catalog/metrics/rag/context_correctness.json @@ -14,5 +14,6 @@ "to_field": "references" } ], - "metric": "metrics.mrr" + "metric": "metrics.mrr", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.context_correctness" } diff --git a/src/unitxt/catalog/metrics/rag/context_correctness/map.json b/src/unitxt/catalog/metrics/rag/context_correctness/map.json index cc8e60bce5..3a5cdd6aad 100644 --- a/src/unitxt/catalog/metrics/rag/context_correctness/map.json +++ b/src/unitxt/catalog/metrics/rag/context_correctness/map.json @@ -14,5 +14,6 @@ "to_field": "references" } ], - "metric": "metrics.map" + "metric": "metrics.map", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.context_correctness.map" } diff --git a/src/unitxt/catalog/metrics/rag/context_correctness/mrr.json b/src/unitxt/catalog/metrics/rag/context_correctness/mrr.json index 1406f2b03f..812c16b342 100644 --- a/src/unitxt/catalog/metrics/rag/context_correctness/mrr.json +++ b/src/unitxt/catalog/metrics/rag/context_correctness/mrr.json @@ -14,5 +14,6 @@ "to_field": "references" } ], - "metric": "metrics.mrr" + "metric": "metrics.mrr", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.context_correctness.mrr" } diff --git a/src/unitxt/catalog/metrics/rag/context_correctness/retrieval_at_k.json b/src/unitxt/catalog/metrics/rag/context_correctness/retrieval_at_k.json index 7eb4f2a34b..813be452b1 100644 --- a/src/unitxt/catalog/metrics/rag/context_correctness/retrieval_at_k.json +++ b/src/unitxt/catalog/metrics/rag/context_correctness/retrieval_at_k.json @@ -14,5 +14,6 @@ "to_field": "references" } ], - "metric": "metrics.retrieval_at_k" + "metric": "metrics.retrieval_at_k", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.context_correctness.retrieval_at_k" } diff --git a/src/unitxt/catalog/metrics/rag/context_perplexity.json b/src/unitxt/catalog/metrics/rag/context_perplexity.json index 55e992bc80..5da814c61a 100644 --- a/src/unitxt/catalog/metrics/rag/context_perplexity.json +++ b/src/unitxt/catalog/metrics/rag/context_perplexity.json @@ -4,13 +4,18 @@ "preprocess_steps": [ { "__type__": "copy", - "field": "contexts", - "to_field": "references" + "field_to_field": { + "task_data/contexts": "references", + "question": "prediction" + }, + "not_exist_do_nothing": true }, { "__type__": "copy", - "field": "question", - "to_field": "prediction" + "field_to_field": { + "contexts": "references" + }, + "not_exist_do_nothing": true } ], "metric": "metrics.perplexity_q.flan_t5_small", @@ -20,5 +25,6 @@ "field": "score/instance/reference_scores", "to_field": "score/instance/score" } - ] + ], + "__deprecated_msg__": "This metric is deprecated. Use metrics.rag.external_rag.context_relevance instead." 
} diff --git a/src/unitxt/catalog/metrics/rag/context_relevance.json b/src/unitxt/catalog/metrics/rag/context_relevance.json index 745437bb1f..9fedadde74 100644 --- a/src/unitxt/catalog/metrics/rag/context_relevance.json +++ b/src/unitxt/catalog/metrics/rag/context_relevance.json @@ -18,5 +18,6 @@ "not_exist_do_nothing": true } ], - "metric": "metrics.perplexity_q.flan_t5_small" + "metric": "metrics.perplexity_q.flan_t5_small", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.context_relevance" } diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/generic_inference_engine_q_c_ares.json b/src/unitxt/catalog/metrics/rag/context_relevance/generic_inference_engine_q_c_ares.json index eb9b0797b6..587ab8bc0b 100644 --- a/src/unitxt/catalog/metrics/rag/context_relevance/generic_inference_engine_q_c_ares.json +++ b/src/unitxt/catalog/metrics/rag/context_relevance/generic_inference_engine_q_c_ares.json @@ -7,6 +7,6 @@ "task": "tasks.rag_eval.context_relevance.binary", "format": null, "main_score": "context_relevance_q_c_ares", - "prediction_field": null, + "prediction_field": "contexts", "infer_log_probs": false } diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/generic_inference_engine_q_c_ares_numeric.json b/src/unitxt/catalog/metrics/rag/context_relevance/generic_inference_engine_q_c_ares_numeric.json index e0285e5995..113c03a400 100644 --- a/src/unitxt/catalog/metrics/rag/context_relevance/generic_inference_engine_q_c_ares_numeric.json +++ b/src/unitxt/catalog/metrics/rag/context_relevance/generic_inference_engine_q_c_ares_numeric.json @@ -7,6 +7,6 @@ "task": "tasks.rag_eval.context_relevance.binary", "format": null, "main_score": "context_relevance_q_c_ares_numeric", - "prediction_field": null, + "prediction_field": "contexts", "infer_log_probs": false } diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares.json b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares.json index 909f9641f2..a25ab27136 100644 --- a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares.json +++ b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares.json @@ -5,6 +5,6 @@ "task": "tasks.rag_eval.context_relevance.binary", "format": null, "main_score": "context_relevance_q_c_ares", - "prediction_field": null, + "prediction_field": "contexts", "infer_log_probs": false } diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_logprobs.json b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_logprobs.json index 2c116e9ecd..8bad207bba 100644 --- a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_logprobs.json +++ b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_logprobs.json @@ -5,6 +5,6 @@ "task": "tasks.rag_eval.context_relevance.binary", "format": null, "main_score": "context_relevance_q_c_ares_logprobs", - "prediction_field": null, + "prediction_field": "contexts", "infer_log_probs": true } diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_numeric.json b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_numeric.json index ed228a3adb..65e1b9e8ad 100644 --- a/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_numeric.json +++ 
b/src/unitxt/catalog/metrics/rag/context_relevance/llama_3_1_70b_instruct_wml_q_c_ares_numeric.json @@ -5,6 +5,6 @@ "task": "tasks.rag_eval.context_relevance.binary", "format": null, "main_score": "context_relevance_q_c_ares_numeric", - "prediction_field": null, + "prediction_field": "contexts", "infer_log_probs": false } diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/perplexity_flan_t5_small.json b/src/unitxt/catalog/metrics/rag/context_relevance/perplexity_flan_t5_small.json index 745437bb1f..af50bbaef0 100644 --- a/src/unitxt/catalog/metrics/rag/context_relevance/perplexity_flan_t5_small.json +++ b/src/unitxt/catalog/metrics/rag/context_relevance/perplexity_flan_t5_small.json @@ -18,5 +18,6 @@ "not_exist_do_nothing": true } ], - "metric": "metrics.perplexity_q.flan_t5_small" + "metric": "metrics.perplexity_q.flan_t5_small", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.context_relevance.perplexity_flan_t5_small" } diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/sentence_bert_bge.json b/src/unitxt/catalog/metrics/rag/context_relevance/sentence_bert_bge.json index 9058871cf4..80c8ce60b8 100644 --- a/src/unitxt/catalog/metrics/rag/context_relevance/sentence_bert_bge.json +++ b/src/unitxt/catalog/metrics/rag/context_relevance/sentence_bert_bge.json @@ -18,5 +18,6 @@ "not_exist_do_nothing": true } ], - "metric": "metrics.sentence_bert.bge_large_en_1_5" + "metric": "metrics.sentence_bert.bge_large_en_1_5", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.context_relevance.sentence_bert_bge" } diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/sentence_bert_mini_lm.json b/src/unitxt/catalog/metrics/rag/context_relevance/sentence_bert_mini_lm.json index d9978a5a0f..442079c9d2 100644 --- a/src/unitxt/catalog/metrics/rag/context_relevance/sentence_bert_mini_lm.json +++ b/src/unitxt/catalog/metrics/rag/context_relevance/sentence_bert_mini_lm.json @@ -18,5 +18,6 @@ "not_exist_do_nothing": true } ], - "metric": "metrics.sentence_bert.minilm_l12_v2" + "metric": "metrics.sentence_bert.minilm_l12_v2", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.context_relevance.sentence_bert_mini_lm" } diff --git a/src/unitxt/catalog/metrics/rag/context_relevance/token_precision.json b/src/unitxt/catalog/metrics/rag/context_relevance/token_precision.json index a381348406..f609e4eefc 100644 --- a/src/unitxt/catalog/metrics/rag/context_relevance/token_precision.json +++ b/src/unitxt/catalog/metrics/rag/context_relevance/token_precision.json @@ -18,5 +18,6 @@ "not_exist_do_nothing": true } ], - "metric": "metrics.token_overlap" + "metric": "metrics.token_overlap", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.context_relevance.token_precision" } diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/bert_score_recall.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/bert_score_recall.json new file mode 100644 index 0000000000..b3ca80896e --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/bert_score_recall.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "recall", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "task_data/reference_answers": "references", + "prediction/answer": "prediction" + } + } + ], + "metric": "metrics.bert_score.deberta_large_mnli", + "score_prefix": 
"answer_correctness_bert_score_recall_" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/bert_score_recall_ml.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/bert_score_recall_ml.json new file mode 100644 index 0000000000..15e80496ec --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/bert_score_recall_ml.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "recall", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "task_data/reference_answers": "references", + "prediction/answer": "prediction" + } + } + ], + "metric": "metrics.bert_score.deberta_v3_base_mnli_xnli_ml", + "score_prefix": "answer_correctness_bert_score_recall_ml_" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/generic_inference_engine_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/generic_inference_engine_judge.json new file mode 100644 index 0000000000..78573d66ce --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/generic_inference_engine_judge.json @@ -0,0 +1,15 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": { + "__type__": "generic_inference_engine" + }, + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/gpt_4o_azure_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/gpt_4o_azure_judge.json new file mode 100644 index 0000000000..a8e997be0d --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/gpt_4o_azure_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.gpt_4o_2024_08_06_azure_openai", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/llama_3_3_70b_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/llama_3_3_70b_instruct_rits_judge.json new file mode 100644 index 0000000000..fdeb51f32c --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/llama_3_3_70b_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.llama_3_3_70b_instruct_rits", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/llama_3_3_70b_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/llama_3_3_70b_instruct_watsonx_judge.json 
new file mode 100644 index 0000000000..975f95415d --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/llama_3_3_70b_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.llama_3_3_70b_instruct_watsonx", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/sentence_bert_bge.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/sentence_bert_bge.json new file mode 100644 index 0000000000..1be72d9863 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/sentence_bert_bge.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "sbert_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "task_data/reference_answers": "references", + "prediction/answer": "prediction" + } + } + ], + "metric": "metrics.sentence_bert.bge_large_en_1_5", + "score_prefix": "answer_correctness_sentence_bert_bge_" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/sentence_bert_mini_lm.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/sentence_bert_mini_lm.json new file mode 100644 index 0000000000..fed3f43b53 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/sentence_bert_mini_lm.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "sbert_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "task_data/reference_answers": "references", + "prediction/answer": "prediction" + } + } + ], + "metric": "metrics.sentence_bert.minilm_l12_v2", + "score_prefix": "answer_correctness_sentence_bert_mini_lm_" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/token_recall.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/token_recall.json new file mode 100644 index 0000000000..2df544a34b --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_correctness/token_recall.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "recall", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "task_data/reference_answers": "references", + "prediction/answer": "prediction" + } + } + ], + "metric": "metrics.token_overlap", + "score_prefix": "answer_correctness_token_recall_" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/answer_reward.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/answer_reward.json new file mode 100644 index 0000000000..055dc249e9 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/answer_reward.json @@ -0,0 +1,22 @@ +{ + "__type__": "metric_pipeline", + "main_score": "reward_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "task_data/question": "references", + "prediction/answer": "prediction" + } + }, + { + "__type__": "list_field_values", + "fields": [ + "references" + ], + "to_field": "references" + } + ], + "metric": "metrics.reward.deberta_v3_large_v2", + "score_prefix": "answer_relevance_" +} diff --git 
a/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/generic_inference_engine_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/generic_inference_engine_judge.json new file mode 100644 index 0000000000..5322836d8d --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/generic_inference_engine_judge.json @@ -0,0 +1,15 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": { + "__type__": "generic_inference_engine" + }, + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/gpt_4o_azure_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/gpt_4o_azure_judge.json new file mode 100644 index 0000000000..43604aab92 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/gpt_4o_azure_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.gpt_4o_2024_08_06_azure_openai", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/llama_3_3_70b_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/llama_3_3_70b_instruct_rits_judge.json new file mode 100644 index 0000000000..2945ec8076 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/llama_3_3_70b_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.llama_3_3_70b_instruct_rits", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/llama_3_3_70b_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/llama_3_3_70b_instruct_watsonx_judge.json new file mode 100644 index 0000000000..0b39a7f8b1 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/llama_3_3_70b_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.llama_3_3_70b_instruct_watsonx", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/token_recall.json b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/token_recall.json new file 
mode 100644 index 0000000000..a8be78860a --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/answer_relevance/token_recall.json @@ -0,0 +1,22 @@ +{ + "__type__": "metric_pipeline", + "main_score": "recall", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "task_data/question": "references", + "prediction/answer": "prediction" + } + }, + { + "__type__": "list_field_values", + "fields": [ + "references" + ], + "to_field": "references" + } + ], + "metric": "metrics.token_overlap", + "score_prefix": "answer_relevance_token_recall_" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/context_correctness/map.json b/src/unitxt/catalog/metrics/rag/end_to_end/context_correctness/map.json new file mode 100644 index 0000000000..ce29515fe1 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/context_correctness/map.json @@ -0,0 +1,19 @@ +{ + "__type__": "metric_pipeline", + "main_score": "map", + "preprocess_steps": [ + { + "__type__": "copy", + "field": "prediction/context_ids", + "to_field": "prediction" + }, + { + "__type__": "wrap", + "field": "task_data/reference_context_ids", + "inside": "list", + "to_field": "references" + } + ], + "metric": "metrics.map", + "score_prefix": "context_correctness_" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/context_correctness/mrr.json b/src/unitxt/catalog/metrics/rag/end_to_end/context_correctness/mrr.json new file mode 100644 index 0000000000..1f8f6c5c3f --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/context_correctness/mrr.json @@ -0,0 +1,19 @@ +{ + "__type__": "metric_pipeline", + "main_score": "mrr", + "preprocess_steps": [ + { + "__type__": "copy", + "field": "prediction/context_ids", + "to_field": "prediction" + }, + { + "__type__": "wrap", + "field": "task_data/reference_context_ids", + "inside": "list", + "to_field": "references" + } + ], + "metric": "metrics.mrr", + "score_prefix": "context_correctness_" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/context_correctness/retrieval_at_k.json b/src/unitxt/catalog/metrics/rag/end_to_end/context_correctness/retrieval_at_k.json new file mode 100644 index 0000000000..afe17c1177 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/context_correctness/retrieval_at_k.json @@ -0,0 +1,19 @@ +{ + "__type__": "metric_pipeline", + "main_score": "match_at_1", + "preprocess_steps": [ + { + "__type__": "copy", + "field": "prediction/context_ids", + "to_field": "prediction" + }, + { + "__type__": "wrap", + "field": "task_data/reference_context_ids", + "inside": "list", + "to_field": "references" + } + ], + "metric": "metrics.retrieval_at_k", + "score_prefix": "context_correctness_" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/generic_inference_engine_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/generic_inference_engine_judge.json new file mode 100644 index 0000000000..7e0be49f74 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/generic_inference_engine_judge.json @@ -0,0 +1,15 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": { + "__type__": "generic_inference_engine" + }, + "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric", + "task": "tasks.rag_eval.context_relevance.binary", + "format": null, + "main_score": "context_relevance_judge", + "prediction_field": "contexts", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } 
+} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/gpt_4o_azure_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/gpt_4o_azure_judge.json new file mode 100644 index 0000000000..da05595b96 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/gpt_4o_azure_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.gpt_4o_2024_08_06_azure_openai", + "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric", + "task": "tasks.rag_eval.context_relevance.binary", + "format": null, + "main_score": "context_relevance_judge", + "prediction_field": "contexts", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/llama_3_3_70b_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/llama_3_3_70b_instruct_rits_judge.json new file mode 100644 index 0000000000..ff8b926c04 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/llama_3_3_70b_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.llama_3_3_70b_instruct_rits", + "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric", + "task": "tasks.rag_eval.context_relevance.binary", + "format": null, + "main_score": "context_relevance_judge", + "prediction_field": "contexts", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/llama_3_3_70b_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/llama_3_3_70b_instruct_watsonx_judge.json new file mode 100644 index 0000000000..d6162a5136 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/llama_3_3_70b_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.llama_3_3_70b_instruct_watsonx", + "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric", + "task": "tasks.rag_eval.context_relevance.binary", + "format": null, + "main_score": "context_relevance_judge", + "prediction_field": "contexts", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/perplexity_flan_t5_small.json b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/perplexity_flan_t5_small.json new file mode 100644 index 0000000000..4786ef6690 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/perplexity_flan_t5_small.json @@ -0,0 +1,18 @@ +{ + "__type__": "metric_pipeline", + "main_score": "perplexity", + "preprocess_steps": [ + { + "__type__": "copy", + "field": "prediction/contexts", + "to_field": "references" + }, + { + "__type__": "copy", + "field": "task_data/question", + "to_field": "prediction" + } + ], + "metric": "metrics.perplexity_q.flan_t5_small", + "score_prefix": "context_relevance_perplexity_flan_t5_small_" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/sentence_bert_bge.json b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/sentence_bert_bge.json new file mode 
100644 index 0000000000..a7f42e6b20 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/sentence_bert_bge.json @@ -0,0 +1,18 @@ +{ + "__type__": "metric_pipeline", + "main_score": "sbert_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field": "prediction/contexts", + "to_field": "references" + }, + { + "__type__": "copy", + "field": "task_data/question", + "to_field": "prediction" + } + ], + "metric": "metrics.sentence_bert.bge_large_en_1_5", + "score_prefix": "context_relevance_sentence_bert_bge_" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/sentence_bert_mini_lm.json b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/sentence_bert_mini_lm.json new file mode 100644 index 0000000000..2d2a9e92e7 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/sentence_bert_mini_lm.json @@ -0,0 +1,18 @@ +{ + "__type__": "metric_pipeline", + "main_score": "sbert_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field": "prediction/contexts", + "to_field": "references" + }, + { + "__type__": "copy", + "field": "task_data/question", + "to_field": "prediction" + } + ], + "metric": "metrics.sentence_bert.minilm_l12_v2", + "score_prefix": "context_relevance_sentence_bert_mini_lm_" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/token_precision.json b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/token_precision.json new file mode 100644 index 0000000000..7eec4bb7ea --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/context_relevance/token_precision.json @@ -0,0 +1,18 @@ +{ + "__type__": "metric_pipeline", + "main_score": "precision", + "preprocess_steps": [ + { + "__type__": "copy", + "field": "prediction/contexts", + "to_field": "references" + }, + { + "__type__": "copy", + "field": "task_data/question", + "to_field": "prediction" + } + ], + "metric": "metrics.token_overlap", + "score_prefix": "context_relevance_token_precision_" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/bert_score_k_precision.json b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/bert_score_k_precision.json new file mode 100644 index 0000000000..9db4232af6 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/bert_score_k_precision.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "precision", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "prediction/contexts": "references", + "prediction/answer": "prediction" + } + } + ], + "metric": "metrics.bert_score.deberta_large_mnli", + "score_prefix": "faithfulness_bert_score_k_precision_" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/bert_score_k_precision_ml.json b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/bert_score_k_precision_ml.json new file mode 100644 index 0000000000..eb3274676b --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/bert_score_k_precision_ml.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "precision", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "prediction/contexts": "references", + "prediction/answer": "prediction" + } + } + ], + "metric": "metrics.bert_score.deberta_v3_base_mnli_xnli_ml", + "score_prefix": "faithfulness_bert_score_k_precision_ml_" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/generic_inference_engine_judge.json 
b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/generic_inference_engine_judge.json new file mode 100644 index 0000000000..eec6488c53 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/generic_inference_engine_judge.json @@ -0,0 +1,15 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": { + "__type__": "generic_inference_engine" + }, + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/gpt_4o_azure_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/gpt_4o_azure_judge.json new file mode 100644 index 0000000000..9850609c32 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/gpt_4o_azure_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.gpt_4o_2024_08_06_azure_openai", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/llama_3_3_70b_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/llama_3_3_70b_instruct_rits_judge.json new file mode 100644 index 0000000000..180157e380 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/llama_3_3_70b_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.llama_3_3_70b_instruct_rits", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/llama_3_3_70b_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/llama_3_3_70b_instruct_watsonx_judge.json new file mode 100644 index 0000000000..b7528e0850 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/llama_3_3_70b_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.llama_3_3_70b_instruct_watsonx", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/sentence_bert_bge.json b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/sentence_bert_bge.json new file mode 100644 index 0000000000..0086e1f2fd --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/sentence_bert_bge.json @@ -0,0 +1,15 @@ +{ 
+ "__type__": "metric_pipeline", + "main_score": "sbert_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "prediction/contexts": "references", + "prediction/answer": "prediction" + } + } + ], + "metric": "metrics.sentence_bert.bge_large_en_1_5", + "score_prefix": "faithfulness_sentence_bert_bge_" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/sentence_bert_mini_lm.json b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/sentence_bert_mini_lm.json new file mode 100644 index 0000000000..5132415afc --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/sentence_bert_mini_lm.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "sbert_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "prediction/contexts": "references", + "prediction/answer": "prediction" + } + } + ], + "metric": "metrics.sentence_bert.minilm_l12_v2", + "score_prefix": "faithfulness_sentence_bert_mini_lm_" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/token_k_precision.json b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/token_k_precision.json new file mode 100644 index 0000000000..1e60065c43 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/token_k_precision.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "precision", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "prediction/contexts": "references", + "prediction/answer": "prediction" + } + } + ], + "metric": "metrics.token_overlap", + "score_prefix": "faithfulness_token_k_precision_" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/vectara_hhem_2_1.json b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/vectara_hhem_2_1.json new file mode 100644 index 0000000000..58774e07bb --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/faithfulness/vectara_hhem_2_1.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "hhem_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "prediction/contexts": "references", + "prediction/answer": "prediction" + } + } + ], + "metric": "metrics.vectara_groundedness_hhem_2_1", + "score_prefix": "faithfulness_vectara_hhem_2_1_" +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/recommended/cpu_only/all.json b/src/unitxt/catalog/metrics/rag/end_to_end/recommended/cpu_only/all.json new file mode 100644 index 0000000000..c25fed2814 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/recommended/cpu_only/all.json @@ -0,0 +1,10 @@ +{ + "__type__": "metrics_list", + "items": [ + "metrics.rag.end_to_end.answer_correctness.token_recall", + "metrics.rag.end_to_end.faithfulness.token_k_precision", + "metrics.rag.end_to_end.answer_relevance.token_recall", + "metrics.rag.end_to_end.context_relevance.token_precision", + "metrics.rag.end_to_end.context_correctness.mrr" + ] +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/recommended/llmaj_azure/all.json b/src/unitxt/catalog/metrics/rag/end_to_end/recommended/llmaj_azure/all.json new file mode 100644 index 0000000000..21d65a000c --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/recommended/llmaj_azure/all.json @@ -0,0 +1,10 @@ +{ + "__type__": "metrics_list", + "items": [ + "metrics.rag.end_to_end.answer_correctness.gpt_4o_azure_judge", + "metrics.rag.end_to_end.faithfulness.gpt_4o_azure_judge", + 
"metrics.rag.end_to_end.answer_relevance.gpt_4o_azure_judge", + "metrics.rag.end_to_end.context_relevance.gpt_4o_azure_judge", + "metrics.rag.end_to_end.context_correctness.mrr" + ] +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/recommended/llmaj_rits/all.json b/src/unitxt/catalog/metrics/rag/end_to_end/recommended/llmaj_rits/all.json new file mode 100644 index 0000000000..1978a9c7c6 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/recommended/llmaj_rits/all.json @@ -0,0 +1,10 @@ +{ + "__type__": "metrics_list", + "items": [ + "metrics.rag.end_to_end.answer_correctness.llama_3_3_70b_instruct_rits_judge", + "metrics.rag.end_to_end.faithfulness.llama_3_3_70b_instruct_rits_judge", + "metrics.rag.end_to_end.answer_relevance.llama_3_3_70b_instruct_rits_judge", + "metrics.rag.end_to_end.context_relevance.llama_3_3_70b_instruct_rits_judge", + "metrics.rag.end_to_end.context_correctness.mrr" + ] +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/recommended/llmaj_watsonx/all.json b/src/unitxt/catalog/metrics/rag/end_to_end/recommended/llmaj_watsonx/all.json new file mode 100644 index 0000000000..2715ec4bc9 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/recommended/llmaj_watsonx/all.json @@ -0,0 +1,10 @@ +{ + "__type__": "metrics_list", + "items": [ + "metrics.rag.end_to_end.answer_correctness.llama_3_3_70b_instruct_watsonx_judge", + "metrics.rag.end_to_end.faithfulness.llama_3_3_70b_instruct_watsonx_judge", + "metrics.rag.end_to_end.answer_relevance.llama_3_3_70b_instruct_watsonx_judge", + "metrics.rag.end_to_end.context_relevance.llama_3_3_70b_instruct_watsonx_judge", + "metrics.rag.end_to_end.context_correctness.mrr" + ] +} diff --git a/src/unitxt/catalog/metrics/rag/end_to_end/recommended/small_llm/all.json b/src/unitxt/catalog/metrics/rag/end_to_end/recommended/small_llm/all.json new file mode 100644 index 0000000000..b62497e386 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/end_to_end/recommended/small_llm/all.json @@ -0,0 +1,10 @@ +{ + "__type__": "metrics_list", + "items": [ + "metrics.rag.end_to_end.answer_correctness.bert_score_recall_ml", + "metrics.rag.end_to_end.faithfulness.vectara_hhem_2_1", + "metrics.rag.end_to_end.answer_relevance.answer_reward", + "metrics.rag.end_to_end.context_relevance.sentence_bert_mini_lm", + "metrics.rag.end_to_end.context_correctness.mrr" + ] +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness.json new file mode 100644 index 0000000000..78993692b6 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "recall", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "ground_truths": "references", + "answer": "prediction" + } + } + ], + "metric": "metrics.token_overlap", + "score_prefix": "answer_correctness_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/bert_score_recall.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/bert_score_recall.json new file mode 100644 index 0000000000..eac0cd9714 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/bert_score_recall.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "recall", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "ground_truths": "references", + "answer": "prediction" + } + } + ], + "metric": 
"metrics.bert_score.deberta_large_mnli", + "score_prefix": "answer_correctness_bert_score_recall_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/bert_score_recall_ml.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/bert_score_recall_ml.json new file mode 100644 index 0000000000..e929f309fd --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/bert_score_recall_ml.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "recall", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "ground_truths": "references", + "answer": "prediction" + } + } + ], + "metric": "metrics.bert_score.deberta_v3_base_mnli_xnli_ml", + "score_prefix": "answer_correctness_bert_score_recall_ml_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/generic_inference_engine_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/generic_inference_engine_judge.json new file mode 100644 index 0000000000..235183d486 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/generic_inference_engine_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": { + "__type__": "generic_inference_engine" + }, + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/gpt_4o_azure_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/gpt_4o_azure_judge.json new file mode 100644 index 0000000000..0e27f876d2 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/gpt_4o_azure_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.gpt_4o_2024_08_06_azure_openai", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/llama_3_3_70b_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/llama_3_3_70b_instruct_rits_judge.json new file mode 100644 index 0000000000..1c4a085265 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/llama_3_3_70b_instruct_rits_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.llama_3_3_70b_instruct_rits", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/llama_3_3_70b_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/llama_3_3_70b_instruct_watsonx_judge.json new file mode 100644 index 0000000000..19d0101632 --- /dev/null 
+++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/llama_3_3_70b_instruct_watsonx_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.llama_3_3_70b_instruct_watsonx", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/sentence_bert_bge.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/sentence_bert_bge.json new file mode 100644 index 0000000000..911c10d3cf --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/sentence_bert_bge.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "sbert_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "ground_truths": "references", + "answer": "prediction" + } + } + ], + "metric": "metrics.sentence_bert.bge_large_en_1_5", + "score_prefix": "answer_correctness_sentence_bert_bge_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/sentence_bert_mini_lm.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/sentence_bert_mini_lm.json new file mode 100644 index 0000000000..cb932e5596 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/sentence_bert_mini_lm.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "sbert_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "ground_truths": "references", + "answer": "prediction" + } + } + ], + "metric": "metrics.sentence_bert.minilm_l12_v2", + "score_prefix": "answer_correctness_sentence_bert_mini_lm_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/token_recall.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/token_recall.json new file mode 100644 index 0000000000..614aee94a3 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_correctness/token_recall.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "recall", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "ground_truths": "references", + "answer": "prediction" + } + } + ], + "metric": "metrics.token_overlap", + "score_prefix": "answer_correctness_token_recall_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/answer_reward.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/answer_reward.json new file mode 100644 index 0000000000..431a671d2d --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/answer_reward.json @@ -0,0 +1,22 @@ +{ + "__type__": "metric_pipeline", + "main_score": "reward_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "question": "references", + "answer": "prediction" + } + }, + { + "__type__": "list_field_values", + "fields": [ + "references" + ], + "to_field": "references" + } + ], + "metric": "metrics.reward.deberta_v3_large_v2", + "score_prefix": "answer_relevance_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/generic_inference_engine_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/generic_inference_engine_judge.json new file 
mode 100644 index 0000000000..a97967729f --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/generic_inference_engine_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": { + "__type__": "generic_inference_engine" + }, + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/gpt_4o_azure_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/gpt_4o_azure_judge.json new file mode 100644 index 0000000000..66e11245c1 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/gpt_4o_azure_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.gpt_4o_2024_08_06_azure_openai", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/llama_3_3_70b_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/llama_3_3_70b_instruct_rits_judge.json new file mode 100644 index 0000000000..9d2af894d6 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/llama_3_3_70b_instruct_rits_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.llama_3_3_70b_instruct_rits", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/llama_3_3_70b_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/llama_3_3_70b_instruct_watsonx_judge.json new file mode 100644 index 0000000000..98afa80333 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/llama_3_3_70b_instruct_watsonx_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.llama_3_3_70b_instruct_watsonx", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/token_recall.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/token_recall.json new file mode 100644 index 0000000000..b30bcfb94b --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_relevance/token_recall.json @@ -0,0 +1,22 @@ +{ + "__type__": "metric_pipeline", + "main_score": "recall", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "question": "references", + "answer": "prediction" + } 
+ }, + { + "__type__": "list_field_values", + "fields": [ + "references" + ], + "to_field": "references" + } + ], + "metric": "metrics.token_overlap", + "score_prefix": "answer_relevance_token_recall_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/answer_reward.json b/src/unitxt/catalog/metrics/rag/external_rag/answer_reward.json new file mode 100644 index 0000000000..431a671d2d --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/answer_reward.json @@ -0,0 +1,22 @@ +{ + "__type__": "metric_pipeline", + "main_score": "reward_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "question": "references", + "answer": "prediction" + } + }, + { + "__type__": "list_field_values", + "fields": [ + "references" + ], + "to_field": "references" + } + ], + "metric": "metrics.reward.deberta_v3_large_v2", + "score_prefix": "answer_relevance_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/context_correctness.json b/src/unitxt/catalog/metrics/rag/external_rag/context_correctness.json new file mode 100644 index 0000000000..65b054d44b --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/context_correctness.json @@ -0,0 +1,19 @@ +{ + "__type__": "metric_pipeline", + "main_score": "mrr", + "preprocess_steps": [ + { + "__type__": "copy", + "field": "context_ids", + "to_field": "prediction" + }, + { + "__type__": "wrap", + "field": "ground_truths_context_ids", + "inside": "list", + "to_field": "references" + } + ], + "metric": "metrics.mrr", + "score_prefix": "context_correctness_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/context_correctness/map.json b/src/unitxt/catalog/metrics/rag/external_rag/context_correctness/map.json new file mode 100644 index 0000000000..d5d49f25f6 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/context_correctness/map.json @@ -0,0 +1,19 @@ +{ + "__type__": "metric_pipeline", + "main_score": "map", + "preprocess_steps": [ + { + "__type__": "copy", + "field": "context_ids", + "to_field": "prediction" + }, + { + "__type__": "wrap", + "field": "ground_truths_context_ids", + "inside": "list", + "to_field": "references" + } + ], + "metric": "metrics.map", + "score_prefix": "context_correctness_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/context_correctness/mrr.json b/src/unitxt/catalog/metrics/rag/external_rag/context_correctness/mrr.json new file mode 100644 index 0000000000..65b054d44b --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/context_correctness/mrr.json @@ -0,0 +1,19 @@ +{ + "__type__": "metric_pipeline", + "main_score": "mrr", + "preprocess_steps": [ + { + "__type__": "copy", + "field": "context_ids", + "to_field": "prediction" + }, + { + "__type__": "wrap", + "field": "ground_truths_context_ids", + "inside": "list", + "to_field": "references" + } + ], + "metric": "metrics.mrr", + "score_prefix": "context_correctness_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/context_correctness/retrieval_at_k.json b/src/unitxt/catalog/metrics/rag/external_rag/context_correctness/retrieval_at_k.json new file mode 100644 index 0000000000..ecade55aab --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/context_correctness/retrieval_at_k.json @@ -0,0 +1,19 @@ +{ + "__type__": "metric_pipeline", + "main_score": "match_at_1", + "preprocess_steps": [ + { + "__type__": "copy", + "field": "context_ids", + "to_field": "prediction" + }, + { + "__type__": "wrap", + "field": "ground_truths_context_ids", + "inside": "list", + 
"to_field": "references" + } + ], + "metric": "metrics.retrieval_at_k", + "score_prefix": "context_correctness_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/context_relevance.json b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance.json new file mode 100644 index 0000000000..f640ae4323 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance.json @@ -0,0 +1,18 @@ +{ + "__type__": "metric_pipeline", + "main_score": "perplexity", + "preprocess_steps": [ + { + "__type__": "copy", + "field": "contexts", + "to_field": "references" + }, + { + "__type__": "copy", + "field": "question", + "to_field": "prediction" + } + ], + "metric": "metrics.perplexity_q.flan_t5_small", + "score_prefix": "context_relevance_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/generic_inference_engine_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/generic_inference_engine_judge.json new file mode 100644 index 0000000000..5715107eee --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/generic_inference_engine_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": { + "__type__": "generic_inference_engine" + }, + "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric", + "task": "tasks.rag_eval.context_relevance.binary", + "format": null, + "main_score": "context_relevance_judge", + "prediction_field": "contexts", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/gpt_4o_azure_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/gpt_4o_azure_judge.json new file mode 100644 index 0000000000..cdfb898998 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/gpt_4o_azure_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.gpt_4o_2024_08_06_azure_openai", + "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric", + "task": "tasks.rag_eval.context_relevance.binary", + "format": null, + "main_score": "context_relevance_judge", + "prediction_field": "contexts", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/llama_3_3_70b_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/llama_3_3_70b_instruct_rits_judge.json new file mode 100644 index 0000000000..3647742fde --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/llama_3_3_70b_instruct_rits_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.llama_3_3_70b_instruct_rits", + "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric", + "task": "tasks.rag_eval.context_relevance.binary", + "format": null, + "main_score": "context_relevance_judge", + "prediction_field": "contexts", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/llama_3_3_70b_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/llama_3_3_70b_instruct_watsonx_judge.json new file mode 100644 index 0000000000..3361f984e1 --- /dev/null +++ 
b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/llama_3_3_70b_instruct_watsonx_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.llama_3_3_70b_instruct_watsonx", + "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares_numeric", + "task": "tasks.rag_eval.context_relevance.binary", + "format": null, + "main_score": "context_relevance_judge", + "prediction_field": "contexts", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/perplexity_flan_t5_small.json b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/perplexity_flan_t5_small.json new file mode 100644 index 0000000000..e1dd5609b1 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/perplexity_flan_t5_small.json @@ -0,0 +1,18 @@ +{ + "__type__": "metric_pipeline", + "main_score": "perplexity", + "preprocess_steps": [ + { + "__type__": "copy", + "field": "contexts", + "to_field": "references" + }, + { + "__type__": "copy", + "field": "question", + "to_field": "prediction" + } + ], + "metric": "metrics.perplexity_q.flan_t5_small", + "score_prefix": "context_relevance_perplexity_flan_t5_small_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/sentence_bert_bge.json b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/sentence_bert_bge.json new file mode 100644 index 0000000000..39238a9dfd --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/sentence_bert_bge.json @@ -0,0 +1,18 @@ +{ + "__type__": "metric_pipeline", + "main_score": "sbert_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field": "contexts", + "to_field": "references" + }, + { + "__type__": "copy", + "field": "question", + "to_field": "prediction" + } + ], + "metric": "metrics.sentence_bert.bge_large_en_1_5", + "score_prefix": "context_relevance_sentence_bert_bge_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/sentence_bert_mini_lm.json b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/sentence_bert_mini_lm.json new file mode 100644 index 0000000000..30cf8600d7 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/sentence_bert_mini_lm.json @@ -0,0 +1,18 @@ +{ + "__type__": "metric_pipeline", + "main_score": "sbert_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field": "contexts", + "to_field": "references" + }, + { + "__type__": "copy", + "field": "question", + "to_field": "prediction" + } + ], + "metric": "metrics.sentence_bert.minilm_l12_v2", + "score_prefix": "context_relevance_sentence_bert_mini_lm_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/token_precision.json b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/token_precision.json new file mode 100644 index 0000000000..d9dd52098d --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/context_relevance/token_precision.json @@ -0,0 +1,18 @@ +{ + "__type__": "metric_pipeline", + "main_score": "precision", + "preprocess_steps": [ + { + "__type__": "copy", + "field": "contexts", + "to_field": "references" + }, + { + "__type__": "copy", + "field": "question", + "to_field": "prediction" + } + ], + "metric": "metrics.token_overlap", + "score_prefix": "context_relevance_token_precision_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/faithfulness.json 
b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness.json new file mode 100644 index 0000000000..eaf54026c9 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "precision", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "contexts": "references", + "answer": "prediction" + } + } + ], + "metric": "metrics.token_overlap", + "score_prefix": "faithfulness_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/bert_score_k_precision.json b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/bert_score_k_precision.json new file mode 100644 index 0000000000..761a17cf6a --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/bert_score_k_precision.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "precision", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "contexts": "references", + "answer": "prediction" + } + } + ], + "metric": "metrics.bert_score.deberta_large_mnli", + "score_prefix": "faithfulness_bert_score_k_precision_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/bert_score_k_precision_ml.json b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/bert_score_k_precision_ml.json new file mode 100644 index 0000000000..5c8ec27934 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/bert_score_k_precision_ml.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "precision", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "contexts": "references", + "answer": "prediction" + } + } + ], + "metric": "metrics.bert_score.deberta_v3_base_mnli_xnli_ml", + "score_prefix": "faithfulness_bert_score_k_precision_ml_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/generic_inference_engine_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/generic_inference_engine_judge.json new file mode 100644 index 0000000000..f422bc4dff --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/generic_inference_engine_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": { + "__type__": "generic_inference_engine" + }, + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/gpt_4o_azure_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/gpt_4o_azure_judge.json new file mode 100644 index 0000000000..f88a8bbaf7 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/gpt_4o_azure_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.gpt_4o_2024_08_06_azure_openai", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/llama_3_3_70b_instruct_rits_judge.json 
b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/llama_3_3_70b_instruct_rits_judge.json new file mode 100644 index 0000000000..3e1f1ceff4 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/llama_3_3_70b_instruct_rits_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.llama_3_3_70b_instruct_rits", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/llama_3_3_70b_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/llama_3_3_70b_instruct_watsonx_judge.json new file mode 100644 index 0000000000..91e16ad8c8 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/llama_3_3_70b_instruct_watsonx_judge.json @@ -0,0 +1,11 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.llama_3_3_70b_instruct_watsonx", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": {} +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/sentence_bert_bge.json b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/sentence_bert_bge.json new file mode 100644 index 0000000000..8ccfb6ee94 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/sentence_bert_bge.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "sbert_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "contexts": "references", + "answer": "prediction" + } + } + ], + "metric": "metrics.sentence_bert.bge_large_en_1_5", + "score_prefix": "faithfulness_sentence_bert_bge_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/sentence_bert_mini_lm.json b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/sentence_bert_mini_lm.json new file mode 100644 index 0000000000..a9a49a4cd9 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/sentence_bert_mini_lm.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "sbert_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "contexts": "references", + "answer": "prediction" + } + } + ], + "metric": "metrics.sentence_bert.minilm_l12_v2", + "score_prefix": "faithfulness_sentence_bert_mini_lm_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/token_k_precision.json b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/token_k_precision.json new file mode 100644 index 0000000000..db69158b30 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/token_k_precision.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "precision", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "contexts": "references", + "answer": "prediction" + } + } + ], + "metric": "metrics.token_overlap", + "score_prefix": "faithfulness_token_k_precision_" +} diff --git 
a/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/vectara_hhem_2_1.json b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/vectara_hhem_2_1.json new file mode 100644 index 0000000000..51c20353a3 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/faithfulness/vectara_hhem_2_1.json @@ -0,0 +1,15 @@ +{ + "__type__": "metric_pipeline", + "main_score": "hhem_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "contexts": "references", + "answer": "prediction" + } + } + ], + "metric": "metrics.vectara_groundedness_hhem_2_1", + "score_prefix": "faithfulness_vectara_hhem_2_1_" +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/recommended/cpu_only/all.json b/src/unitxt/catalog/metrics/rag/external_rag/recommended/cpu_only/all.json new file mode 100644 index 0000000000..e2cc5a65f6 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/recommended/cpu_only/all.json @@ -0,0 +1,10 @@ +{ + "__type__": "metrics_list", + "items": [ + "metrics.rag.external_rag.answer_correctness.token_recall", + "metrics.rag.external_rag.faithfulness.token_k_precision", + "metrics.rag.external_rag.answer_relevance.token_recall", + "metrics.rag.external_rag.context_relevance.token_precision", + "metrics.rag.external_rag.context_correctness.mrr" + ] +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/recommended/llmaj_azure/all.json b/src/unitxt/catalog/metrics/rag/external_rag/recommended/llmaj_azure/all.json new file mode 100644 index 0000000000..a2fcb6cb07 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/recommended/llmaj_azure/all.json @@ -0,0 +1,10 @@ +{ + "__type__": "metrics_list", + "items": [ + "metrics.rag.external_rag.answer_correctness.gpt_4o_azure_judge", + "metrics.rag.external_rag.faithfulness.gpt_4o_azure_judge", + "metrics.rag.external_rag.answer_relevance.gpt_4o_azure_judge", + "metrics.rag.external_rag.context_relevance.gpt_4o_azure_judge", + "metrics.rag.external_rag.context_correctness.mrr" + ] +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/recommended/llmaj_rits/all.json b/src/unitxt/catalog/metrics/rag/external_rag/recommended/llmaj_rits/all.json new file mode 100644 index 0000000000..3bd61eee92 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/recommended/llmaj_rits/all.json @@ -0,0 +1,10 @@ +{ + "__type__": "metrics_list", + "items": [ + "metrics.rag.external_rag.answer_correctness.llama_3_3_70b_instruct_rits_judge", + "metrics.rag.external_rag.faithfulness.llama_3_3_70b_instruct_rits_judge", + "metrics.rag.external_rag.answer_relevance.llama_3_3_70b_instruct_rits_judge", + "metrics.rag.external_rag.context_relevance.llama_3_3_70b_instruct_rits_judge", + "metrics.rag.external_rag.context_correctness.mrr" + ] +} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/recommended/llmaj_watsonx/all.json b/src/unitxt/catalog/metrics/rag/external_rag/recommended/llmaj_watsonx/all.json new file mode 100644 index 0000000000..3283fc5356 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/recommended/llmaj_watsonx/all.json @@ -0,0 +1,10 @@ +{ + "__type__": "metrics_list", + "items": [ + "metrics.rag.external_rag.answer_correctness.llama_3_3_70b_instruct_watsonx_judge", + "metrics.rag.external_rag.faithfulness.llama_3_3_70b_instruct_watsonx_judge", + "metrics.rag.external_rag.answer_relevance.llama_3_3_70b_instruct_watsonx_judge", + "metrics.rag.external_rag.context_relevance.llama_3_3_70b_instruct_watsonx_judge", + "metrics.rag.external_rag.context_correctness.mrr" + ] 
+} diff --git a/src/unitxt/catalog/metrics/rag/external_rag/recommended/small_llm/all.json b/src/unitxt/catalog/metrics/rag/external_rag/recommended/small_llm/all.json new file mode 100644 index 0000000000..717387a02f --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/external_rag/recommended/small_llm/all.json @@ -0,0 +1,10 @@ +{ + "__type__": "metrics_list", + "items": [ + "metrics.rag.external_rag.answer_correctness.bert_score_recall_ml", + "metrics.rag.external_rag.faithfulness.vectara_hhem_2_1", + "metrics.rag.external_rag.answer_relevance.answer_reward", + "metrics.rag.external_rag.context_relevance.sentence_bert_mini_lm", + "metrics.rag.external_rag.context_correctness.mrr" + ] +} diff --git a/src/unitxt/catalog/metrics/rag/faithfulness.json b/src/unitxt/catalog/metrics/rag/faithfulness.json index 5d241050ab..a8d6aae932 100644 --- a/src/unitxt/catalog/metrics/rag/faithfulness.json +++ b/src/unitxt/catalog/metrics/rag/faithfulness.json @@ -18,5 +18,6 @@ "not_exist_do_nothing": true } ], - "metric": "metrics.token_overlap" + "metric": "metrics.token_overlap", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.faithfulness" } diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/bert_score_k_precision.json b/src/unitxt/catalog/metrics/rag/faithfulness/bert_score_k_precision.json index a2ad678e9d..5795435097 100644 --- a/src/unitxt/catalog/metrics/rag/faithfulness/bert_score_k_precision.json +++ b/src/unitxt/catalog/metrics/rag/faithfulness/bert_score_k_precision.json @@ -18,5 +18,6 @@ "not_exist_do_nothing": true } ], - "metric": "metrics.bert_score.deberta_large_mnli" + "metric": "metrics.bert_score.deberta_large_mnli", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.faithfulness.bert_score_k_precision" } diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/bert_score_k_precision_ml.json b/src/unitxt/catalog/metrics/rag/faithfulness/bert_score_k_precision_ml.json index ce87af17ce..2ed0af2910 100644 --- a/src/unitxt/catalog/metrics/rag/faithfulness/bert_score_k_precision_ml.json +++ b/src/unitxt/catalog/metrics/rag/faithfulness/bert_score_k_precision_ml.json @@ -18,5 +18,6 @@ "not_exist_do_nothing": true } ], - "metric": "metrics.bert_score.deberta_v3_base_mnli_xnli_ml" + "metric": "metrics.bert_score.deberta_v3_base_mnli_xnli_ml", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.faithfulness.bert_score_k_precision_ml" } diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/sentence_bert_bge.json b/src/unitxt/catalog/metrics/rag/faithfulness/sentence_bert_bge.json index 2f0f780124..5f8f5d0f0b 100644 --- a/src/unitxt/catalog/metrics/rag/faithfulness/sentence_bert_bge.json +++ b/src/unitxt/catalog/metrics/rag/faithfulness/sentence_bert_bge.json @@ -18,5 +18,6 @@ "not_exist_do_nothing": true } ], - "metric": "metrics.sentence_bert.bge_large_en_1_5" + "metric": "metrics.sentence_bert.bge_large_en_1_5", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.faithfulness.sentence_bert_bge" } diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/sentence_bert_mini_lm.json b/src/unitxt/catalog/metrics/rag/faithfulness/sentence_bert_mini_lm.json index e12cc92215..9abf458135 100644 --- a/src/unitxt/catalog/metrics/rag/faithfulness/sentence_bert_mini_lm.json +++ b/src/unitxt/catalog/metrics/rag/faithfulness/sentence_bert_mini_lm.json @@ -18,5 +18,6 @@ "not_exist_do_nothing": true } ], - "metric": "metrics.sentence_bert.minilm_l12_v2" + "metric": 
"metrics.sentence_bert.minilm_l12_v2", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.faithfulness.sentence_bert_mini_lm" } diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/token_k_precision.json b/src/unitxt/catalog/metrics/rag/faithfulness/token_k_precision.json index 5d241050ab..fce5571214 100644 --- a/src/unitxt/catalog/metrics/rag/faithfulness/token_k_precision.json +++ b/src/unitxt/catalog/metrics/rag/faithfulness/token_k_precision.json @@ -18,5 +18,6 @@ "not_exist_do_nothing": true } ], - "metric": "metrics.token_overlap" + "metric": "metrics.token_overlap", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.faithfulness.token_k_precision" } diff --git a/src/unitxt/catalog/metrics/rag/faithfulness/vectara_hhem_2_1.json b/src/unitxt/catalog/metrics/rag/faithfulness/vectara_hhem_2_1.json index 6ce69c6693..d0fe91c666 100644 --- a/src/unitxt/catalog/metrics/rag/faithfulness/vectara_hhem_2_1.json +++ b/src/unitxt/catalog/metrics/rag/faithfulness/vectara_hhem_2_1.json @@ -18,8 +18,6 @@ "not_exist_do_nothing": true } ], - "metric": { - "__type__": "faithfulness_hhem" - }, - "__description__": "Vectara's halucination detection model, HHEM2.1, compares contexts and generated answer to determine faithfulness." + "metric": "metrics.vectara_groundedness_hhem_2_1", + "__deprecated_msg__": "This metric should be replaced with metrics.rag.external_rag.faithfulness.vectara_hhem_2_1" } diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/bert_score_recall.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/bert_score_recall.json new file mode 100644 index 0000000000..e8673bc18f --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/bert_score_recall.json @@ -0,0 +1,14 @@ +{ + "__type__": "metric_pipeline", + "main_score": "recall", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "task_data/reference_answers": "references" + } + } + ], + "metric": "metrics.bert_score.deberta_large_mnli", + "score_prefix": "answer_correctness_bert_score_recall_" +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/bert_score_recall_ml.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/bert_score_recall_ml.json new file mode 100644 index 0000000000..d0038bfcd5 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/bert_score_recall_ml.json @@ -0,0 +1,14 @@ +{ + "__type__": "metric_pipeline", + "main_score": "recall", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "task_data/reference_answers": "references" + } + } + ], + "metric": "metrics.bert_score.deberta_v3_base_mnli_xnli_ml", + "score_prefix": "answer_correctness_bert_score_recall_ml_" +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/generic_inference_engine_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/generic_inference_engine_judge.json new file mode 100644 index 0000000000..78573d66ce --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/generic_inference_engine_judge.json @@ -0,0 +1,15 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": { + "__type__": "generic_inference_engine" + }, + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": 
"tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/gpt_4o_azure_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/gpt_4o_azure_judge.json new file mode 100644 index 0000000000..a8e997be0d --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/gpt_4o_azure_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.gpt_4o_2024_08_06_azure_openai", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/llama_3_3_70b_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/llama_3_3_70b_instruct_rits_judge.json new file mode 100644 index 0000000000..fdeb51f32c --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/llama_3_3_70b_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.llama_3_3_70b_instruct_rits", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/llama_3_3_70b_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/llama_3_3_70b_instruct_watsonx_judge.json new file mode 100644 index 0000000000..975f95415d --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/llama_3_3_70b_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.llama_3_3_70b_instruct_watsonx", + "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context_numeric", + "task": "tasks.rag_eval.answer_correctness.binary", + "format": null, + "main_score": "answer_correctness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/sentence_bert_bge.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/sentence_bert_bge.json new file mode 100644 index 0000000000..adebe8235c --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/sentence_bert_bge.json @@ -0,0 +1,14 @@ +{ + "__type__": "metric_pipeline", + "main_score": "sbert_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "task_data/reference_answers": "references" + } + } + ], + "metric": 
"metrics.sentence_bert.bge_large_en_1_5", + "score_prefix": "answer_correctness_sentence_bert_bge_" +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/sentence_bert_mini_lm.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/sentence_bert_mini_lm.json new file mode 100644 index 0000000000..f56cdd6ab2 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/sentence_bert_mini_lm.json @@ -0,0 +1,14 @@ +{ + "__type__": "metric_pipeline", + "main_score": "sbert_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "task_data/reference_answers": "references" + } + } + ], + "metric": "metrics.sentence_bert.minilm_l12_v2", + "score_prefix": "answer_correctness_sentence_bert_mini_lm_" +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/token_recall.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/token_recall.json new file mode 100644 index 0000000000..f387ada3bc --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_correctness/token_recall.json @@ -0,0 +1,14 @@ +{ + "__type__": "metric_pipeline", + "main_score": "recall", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "task_data/reference_answers": "references" + } + } + ], + "metric": "metrics.token_overlap", + "score_prefix": "answer_correctness_token_recall_" +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/answer_reward.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/answer_reward.json new file mode 100644 index 0000000000..537188c896 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/answer_reward.json @@ -0,0 +1,21 @@ +{ + "__type__": "metric_pipeline", + "main_score": "reward_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "task_data/question": "references" + } + }, + { + "__type__": "list_field_values", + "fields": [ + "references" + ], + "to_field": "references" + } + ], + "metric": "metrics.reward.deberta_v3_large_v2", + "score_prefix": "answer_relevance_" +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/generic_inference_engine_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/generic_inference_engine_judge.json new file mode 100644 index 0000000000..5322836d8d --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/generic_inference_engine_judge.json @@ -0,0 +1,15 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": { + "__type__": "generic_inference_engine" + }, + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/gpt_4o_azure_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/gpt_4o_azure_judge.json new file mode 100644 index 0000000000..43604aab92 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/gpt_4o_azure_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": 
"engines.classification.gpt_4o_2024_08_06_azure_openai", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/llama_3_3_70b_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/llama_3_3_70b_instruct_rits_judge.json new file mode 100644 index 0000000000..2945ec8076 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/llama_3_3_70b_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.llama_3_3_70b_instruct_rits", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/llama_3_3_70b_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/llama_3_3_70b_instruct_watsonx_judge.json new file mode 100644 index 0000000000..0b39a7f8b1 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/llama_3_3_70b_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.llama_3_3_70b_instruct_watsonx", + "template": "templates.rag_eval.answer_relevance.judge_answer_relevance_numeric", + "task": "tasks.rag_eval.answer_relevance.binary", + "format": null, + "main_score": "answer_relevance_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/token_recall.json b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/token_recall.json new file mode 100644 index 0000000000..c80c5a2d0c --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/answer_relevance/token_recall.json @@ -0,0 +1,21 @@ +{ + "__type__": "metric_pipeline", + "main_score": "recall", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "task_data/question": "references" + } + }, + { + "__type__": "list_field_values", + "fields": [ + "references" + ], + "to_field": "references" + } + ], + "metric": "metrics.token_overlap", + "score_prefix": "answer_relevance_token_recall_" +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/correctness/bert_score/deberta_large_mnli.json b/src/unitxt/catalog/metrics/rag/response_generation/correctness/bert_score/deberta_large_mnli.json index 7173a94093..83a1b1f89d 100644 --- a/src/unitxt/catalog/metrics/rag/response_generation/correctness/bert_score/deberta_large_mnli.json +++ b/src/unitxt/catalog/metrics/rag/response_generation/correctness/bert_score/deberta_large_mnli.json @@ -17,5 +17,6 @@ } ], "metric": "metrics.bert_score.deberta_large_mnli", - "prediction_type": "str" + "prediction_type": "str", + "__deprecated_msg__": "Metric 
metrics.rag.response_generation.correctness.bert_score.deberta_large_mnli is deprecated. Please use metrics.rag.response_generation.answer_correctness.bert_score_recall instead." } diff --git a/src/unitxt/catalog/metrics/rag/response_generation/correctness/bert_score/deberta_v3_base_mnli_xnli_ml.json b/src/unitxt/catalog/metrics/rag/response_generation/correctness/bert_score/deberta_v3_base_mnli_xnli_ml.json index 9394fb9c0b..97884f596c 100644 --- a/src/unitxt/catalog/metrics/rag/response_generation/correctness/bert_score/deberta_v3_base_mnli_xnli_ml.json +++ b/src/unitxt/catalog/metrics/rag/response_generation/correctness/bert_score/deberta_v3_base_mnli_xnli_ml.json @@ -17,5 +17,6 @@ } ], "metric": "metrics.bert_score.deberta_v3_base_mnli_xnli_ml", - "prediction_type": "str" + "prediction_type": "str", + "__deprecated_msg__": "Metric metrics.rag.response_generation.correctness.bert_score.deberta_v3_base_mnli_xnli_ml is deprecated. Please use metrics.rag.response_generation.answer_correctness.bert_score_recall_ml instead." } diff --git a/src/unitxt/catalog/metrics/rag/response_generation/correctness/token_overlap.json b/src/unitxt/catalog/metrics/rag/response_generation/correctness/token_overlap.json index 09282392e0..54689de4f3 100644 --- a/src/unitxt/catalog/metrics/rag/response_generation/correctness/token_overlap.json +++ b/src/unitxt/catalog/metrics/rag/response_generation/correctness/token_overlap.json @@ -17,5 +17,6 @@ } ], "metric": "metrics.token_overlap", - "prediction_type": "str" + "prediction_type": "str", + "__deprecated_msg__": "Metric metrics.rag.response_generation.correctness.token_overlap is deprecated. Please use metrics.rag.response_generation.answer_correctness.token_recall instead." } diff --git a/src/unitxt/catalog/metrics/rag/response_generation/faithfullness/token_overlap.json b/src/unitxt/catalog/metrics/rag/response_generation/faithfullness/token_overlap.json index b62cbaf4ac..84b3c36e2c 100644 --- a/src/unitxt/catalog/metrics/rag/response_generation/faithfullness/token_overlap.json +++ b/src/unitxt/catalog/metrics/rag/response_generation/faithfullness/token_overlap.json @@ -23,5 +23,6 @@ } ], "metric": "metrics.token_overlap", - "prediction_type": "str" + "prediction_type": "str", + "__deprecated_msg__": "Metric metrics.rag.response_generation.faithfullness.token_overlap is deprecated. Please use metrics.rag.response_generation.faithfulness.token_k_precision instead." 
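Editor's note on the entries above: the new response_generation answer_correctness artifacts are plain metric_pipeline objects (a copy step moves the gold answers from task_data into the standard references field, a wrapped base metric does the scoring, and a score_prefix keeps the resulting score names distinct), while the old correctness entries now only carry a __deprecated_msg__ redirecting to their replacements. The following is a rough, non-authoritative sketch of how such a pipeline could be registered from a prepare script; MetricPipeline, Copy and add_to_catalog are existing unitxt building blocks, but the exact prepare code in this PR may differ.

# Illustrative sketch only, not the literal prepare script from this PR:
# registering the answer_correctness.token_recall pipeline shown above.
from unitxt import add_to_catalog
from unitxt.metrics import MetricPipeline
from unitxt.operators import Copy

metric = MetricPipeline(
    main_score="recall",
    preprocess_steps=[
        # copy the gold answers from task_data into the standard "references" field
        Copy(field_to_field={"task_data/reference_answers": "references"}),
    ],
    metric="metrics.token_overlap",
    score_prefix="answer_correctness_token_recall_",
)
add_to_catalog(
    metric,
    "metrics.rag.response_generation.answer_correctness.token_recall",
    overwrite=True,
)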
} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/bert_score_k_precision.json b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/bert_score_k_precision.json new file mode 100644 index 0000000000..c27cf49e86 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/bert_score_k_precision.json @@ -0,0 +1,14 @@ +{ + "__type__": "metric_pipeline", + "main_score": "precision", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "task_data/contexts": "references" + } + } + ], + "metric": "metrics.bert_score.deberta_large_mnli", + "score_prefix": "faithfulness_bert_score_k_precision_" +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/bert_score_k_precision_ml.json b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/bert_score_k_precision_ml.json new file mode 100644 index 0000000000..b2baf632bc --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/bert_score_k_precision_ml.json @@ -0,0 +1,14 @@ +{ + "__type__": "metric_pipeline", + "main_score": "precision", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "task_data/contexts": "references" + } + } + ], + "metric": "metrics.bert_score.deberta_v3_base_mnli_xnli_ml", + "score_prefix": "faithfulness_bert_score_k_precision_ml_" +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/generic_inference_engine_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/generic_inference_engine_judge.json new file mode 100644 index 0000000000..eec6488c53 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/generic_inference_engine_judge.json @@ -0,0 +1,15 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": { + "__type__": "generic_inference_engine" + }, + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/gpt_4o_azure_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/gpt_4o_azure_judge.json new file mode 100644 index 0000000000..9850609c32 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/gpt_4o_azure_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.gpt_4o_2024_08_06_azure_openai", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/llama_3_3_70b_instruct_rits_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/llama_3_3_70b_instruct_rits_judge.json new file mode 100644 index 0000000000..180157e380 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/llama_3_3_70b_instruct_rits_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": 
"engines.classification.llama_3_3_70b_instruct_rits", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/llama_3_3_70b_instruct_watsonx_judge.json b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/llama_3_3_70b_instruct_watsonx_judge.json new file mode 100644 index 0000000000..b7528e0850 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/llama_3_3_70b_instruct_watsonx_judge.json @@ -0,0 +1,13 @@ +{ + "__type__": "task_based_ll_mas_judge", + "inference_model": "engines.classification.llama_3_3_70b_instruct_watsonx", + "template": "templates.rag_eval.faithfulness.judge_with_question_simplified_verbal", + "task": "tasks.rag_eval.faithfulness.binary", + "format": null, + "main_score": "faithfulness_judge", + "prediction_field": "answer", + "infer_log_probs": false, + "judge_to_generator_fields_mapping": { + "ground_truths": "reference_answers" + } +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/sentence_bert_bge.json b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/sentence_bert_bge.json new file mode 100644 index 0000000000..e99e18928c --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/sentence_bert_bge.json @@ -0,0 +1,14 @@ +{ + "__type__": "metric_pipeline", + "main_score": "sbert_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "task_data/contexts": "references" + } + } + ], + "metric": "metrics.sentence_bert.bge_large_en_1_5", + "score_prefix": "faithfulness_sentence_bert_bge_" +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/sentence_bert_mini_lm.json b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/sentence_bert_mini_lm.json new file mode 100644 index 0000000000..707b96e642 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/sentence_bert_mini_lm.json @@ -0,0 +1,14 @@ +{ + "__type__": "metric_pipeline", + "main_score": "sbert_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "task_data/contexts": "references" + } + } + ], + "metric": "metrics.sentence_bert.minilm_l12_v2", + "score_prefix": "faithfulness_sentence_bert_mini_lm_" +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/token_k_precision.json b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/token_k_precision.json new file mode 100644 index 0000000000..b45389a2d2 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/token_k_precision.json @@ -0,0 +1,14 @@ +{ + "__type__": "metric_pipeline", + "main_score": "precision", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "task_data/contexts": "references" + } + } + ], + "metric": "metrics.token_overlap", + "score_prefix": "faithfulness_token_k_precision_" +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/vectara_hhem_2_1.json b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/vectara_hhem_2_1.json new file mode 100644 index 0000000000..d9b6981ded --- /dev/null +++ 
b/src/unitxt/catalog/metrics/rag/response_generation/faithfulness/vectara_hhem_2_1.json @@ -0,0 +1,14 @@ +{ + "__type__": "metric_pipeline", + "main_score": "hhem_score", + "preprocess_steps": [ + { + "__type__": "copy", + "field_to_field": { + "task_data/contexts": "references" + } + } + ], + "metric": "metrics.vectara_groundedness_hhem_2_1", + "score_prefix": "faithfulness_vectara_hhem_2_1_" +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/recommended/cpu_only/all.json b/src/unitxt/catalog/metrics/rag/response_generation/recommended/cpu_only/all.json new file mode 100644 index 0000000000..9225a4709a --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/recommended/cpu_only/all.json @@ -0,0 +1,8 @@ +{ + "__type__": "metrics_list", + "items": [ + "metrics.rag.response_generation.answer_correctness.token_recall", + "metrics.rag.response_generation.faithfulness.token_k_precision", + "metrics.rag.response_generation.answer_relevance.token_recall" + ] +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/recommended/llmaj_azure/all.json b/src/unitxt/catalog/metrics/rag/response_generation/recommended/llmaj_azure/all.json new file mode 100644 index 0000000000..24c71e076e --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/recommended/llmaj_azure/all.json @@ -0,0 +1,8 @@ +{ + "__type__": "metrics_list", + "items": [ + "metrics.rag.response_generation.answer_correctness.gpt_4o_azure_judge", + "metrics.rag.response_generation.faithfulness.gpt_4o_azure_judge", + "metrics.rag.response_generation.answer_relevance.gpt_4o_azure_judge" + ] +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/recommended/llmaj_rits/all.json b/src/unitxt/catalog/metrics/rag/response_generation/recommended/llmaj_rits/all.json new file mode 100644 index 0000000000..555bdaf5e1 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/recommended/llmaj_rits/all.json @@ -0,0 +1,8 @@ +{ + "__type__": "metrics_list", + "items": [ + "metrics.rag.response_generation.answer_correctness.llama_3_3_70b_instruct_rits_judge", + "metrics.rag.response_generation.faithfulness.llama_3_3_70b_instruct_rits_judge", + "metrics.rag.response_generation.answer_relevance.llama_3_3_70b_instruct_rits_judge" + ] +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/recommended/llmaj_watsonx/all.json b/src/unitxt/catalog/metrics/rag/response_generation/recommended/llmaj_watsonx/all.json new file mode 100644 index 0000000000..2f619f31a5 --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/recommended/llmaj_watsonx/all.json @@ -0,0 +1,8 @@ +{ + "__type__": "metrics_list", + "items": [ + "metrics.rag.response_generation.answer_correctness.llama_3_3_70b_instruct_watsonx_judge", + "metrics.rag.response_generation.faithfulness.llama_3_3_70b_instruct_watsonx_judge", + "metrics.rag.response_generation.answer_relevance.llama_3_3_70b_instruct_watsonx_judge" + ] +} diff --git a/src/unitxt/catalog/metrics/rag/response_generation/recommended/small_llm/all.json b/src/unitxt/catalog/metrics/rag/response_generation/recommended/small_llm/all.json new file mode 100644 index 0000000000..e85affceca --- /dev/null +++ b/src/unitxt/catalog/metrics/rag/response_generation/recommended/small_llm/all.json @@ -0,0 +1,8 @@ +{ + "__type__": "metrics_list", + "items": [ + "metrics.rag.response_generation.answer_correctness.bert_score_recall_ml", + "metrics.rag.response_generation.faithfulness.vectara_hhem_2_1", + 
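Editor's note on the recommended bundles above: each recommended/*/all.json entry is a metrics_list artifact that groups one metric per RAG dimension for a given compute budget (cpu_only, small_llm, or one of the LLM-as-judge backends). Because they are ordinary catalog artifacts, their identifiers can be passed wherever unitxt accepts a metric id, or fetched and inspected directly. A minimal sketch, assuming the cpu_only bundle defined above; fetch_artifact is an existing unitxt helper, and the comment describes the expected contents rather than verified output.

# Minimal sketch: the recommended bundles are regular catalog artifacts.
from unitxt.artifact import fetch_artifact

bundle, _ = fetch_artifact("metrics.rag.response_generation.recommended.cpu_only.all")
# bundle.items holds the three metrics registered above:
# answer_correctness.token_recall, faithfulness.token_k_precision,
# answer_relevance.token_recall
print(len(bundle.items))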
"metrics.rag.response_generation.answer_relevance.answer_reward" + ] +} diff --git a/src/unitxt/catalog/metrics/vectara_groundedness_hhem_2_1.json b/src/unitxt/catalog/metrics/vectara_groundedness_hhem_2_1.json new file mode 100644 index 0000000000..d28beb6eca --- /dev/null +++ b/src/unitxt/catalog/metrics/vectara_groundedness_hhem_2_1.json @@ -0,0 +1,3 @@ +{ + "__type__": "faithfulness_hhem" +} diff --git a/src/unitxt/catalog/templates/rag_eval/answer_relevance/judge_answer_relevance_numeric.json b/src/unitxt/catalog/templates/rag_eval/answer_relevance/judge_answer_relevance_numeric.json index 0d40c59147..3310651062 100644 --- a/src/unitxt/catalog/templates/rag_eval/answer_relevance/judge_answer_relevance_numeric.json +++ b/src/unitxt/catalog/templates/rag_eval/answer_relevance/judge_answer_relevance_numeric.json @@ -10,5 +10,5 @@ ], "reference": "{number_val}", "target_prefix": "Answer: ", - "instruction": "You are given a question and a prediction from a model. Please determine whether or not the prediction answers the question.\nProvide a rating from one of the following choices: 'Very Bad', 'Bad', 'Mediocre', 'Good', 'Very Good'. Reply using the format of [[rating]], for example: '[[Mediocre]]'.\n" + "instruction": "You are given a question and a prediction from a model. Please determine whether or not the prediction answers the question.\nThe prediction answers the question if it provides a relevant, appropriate and and complete answer to the question.\nIgnore the correctness of the prediction in your judgement.\nOn a scale of 0 to 10, to what extent does the prediction answer the question? Reply with your rating score without any preceding explanation.\n" } diff --git a/src/unitxt/catalog/templates/rag_eval/faithfulness/judge_no_question_simplified_verbal.json b/src/unitxt/catalog/templates/rag_eval/faithfulness/judge_no_question_simplified_verbal.json index 679a6a3c63..74d0a364a7 100644 --- a/src/unitxt/catalog/templates/rag_eval/faithfulness/judge_no_question_simplified_verbal.json +++ b/src/unitxt/catalog/templates/rag_eval/faithfulness/judge_no_question_simplified_verbal.json @@ -9,5 +9,5 @@ ], "reference": "{number_val}", "target_prefix": "Answer: ", - "instruction": "You are given a grounding evidence and a prediction from a model. Compare the \"Prediction\" and the \"Evidence\" to determine to what extent the prediction is grounded in the evidence.\nTo be grounded in the evidence, all the information of the prediction must either be present in the evidence or deducible from the evidence.\n\nReply with one of the 4 options, without any further explanations:\n\"Completely Grounded\" - if the prediction is grounded in the evidence.\n\"Mostly grounded\" - if the vast majority of the information in the prediction is grounded in the evidence, but there is a small or negligible part of the prediction which is not present in the evidence.\n\"Somewhat grounded\" - If some of the information in the prediction is grounded in the evidence.\n\"Not grounded\" - If most or all of the information in the prediction is not grounded in the evidence\n" + "instruction": "You are given a grounding evidence and a prediction from a model. Compare the \"Prediction\" and the \"Evidence\" to determine to what extent the prediction is grounded in the evidence.\nTo be grounded in the evidence, all the information of the prediction must either be present in the evidence or deducible from the evidence.\n\nBase your answer only on the information in the evidence. 
If the prediction is correct but not present in the evidence then it is not grounded.\nReply with one of the 4 options, without any further explanations:\n\"Completely Grounded\" - if the prediction is grounded in the evidence.\n\"Mostly grounded\" - if the vast majority of the information in the prediction is grounded in the evidence, but there is a small or negligible part of the prediction which is not present in the evidence.\n\"Somewhat grounded\" - If some of the information in the prediction is grounded in the evidence.\n\"Not grounded\" - If most or all of the information in the prediction is not grounded in the evidence\n" } diff --git a/src/unitxt/catalog/templates/rag_eval/faithfulness/judge_no_question_simplified_verbal_good_bad.json b/src/unitxt/catalog/templates/rag_eval/faithfulness/judge_no_question_simplified_verbal_good_bad.json index 9d3468b23d..ae718b8308 100644 --- a/src/unitxt/catalog/templates/rag_eval/faithfulness/judge_no_question_simplified_verbal_good_bad.json +++ b/src/unitxt/catalog/templates/rag_eval/faithfulness/judge_no_question_simplified_verbal_good_bad.json @@ -9,5 +9,5 @@ ], "reference": "{number_val}", "target_prefix": "Answer: ", - "instruction": "You are given a grounding evidence and a prediction from a model.\nCompare the \"Prediction\" and the \"Evidence\" to determine whether or not the prediction is grounded in the evidence.\nA good prediction is a prediction which is grounded in the evidence, i.e. all the information of the prediction must either be present in the evidence or deducible from the evidence.\n\nProvide a rating from one of the following choices: 'Very Bad', 'Bad', 'Mediocre', 'Good', 'Very Good'. Reply using the format of [[rating]], for example: '[[Mediocre]]'.\n" + "instruction": "You are given a grounding evidence and a prediction from a model.\nCompare the \"Prediction\" and the \"Evidence\" to determine whether or not the prediction is grounded in the evidence.\nA good prediction is a prediction which is grounded in the evidence, i.e. all the information of the prediction must either be present in the evidence or deducible from the evidence.\n\nBase your answer only on the information in the evidence. If the prediction is correct but not present in the evidence - it is not grounded.\nProvide a rating from one of the following choices: 'Very Bad', 'Bad', 'Mediocre', 'Good', 'Very Good'. Reply using the format of [[rating]], for example: '[[Mediocre]]'.\n" } diff --git a/src/unitxt/catalog/templates/rag_eval/faithfulness/judge_with_question_simplified_verbal.json b/src/unitxt/catalog/templates/rag_eval/faithfulness/judge_with_question_simplified_verbal.json index b7c210194b..cc2a81d10d 100644 --- a/src/unitxt/catalog/templates/rag_eval/faithfulness/judge_with_question_simplified_verbal.json +++ b/src/unitxt/catalog/templates/rag_eval/faithfulness/judge_with_question_simplified_verbal.json @@ -9,5 +9,5 @@ ], "reference": "{number_val}", "target_prefix": "Answer: ", - "instruction": "You are given a question, the corresponding evidence and a prediction from a model.
Compare the \"Prediction\" and the \"Evidence\" to determine to what extent the prediction is grounded in the evidence.\nTo be grounded in the evidence, all the information of the prediction must either be present in the evidence or deducible from the evidence.\n\nThe question is only given for context, and is irrelevant for determining the groundedness of the prediction.\nReply with one of the 4 options, without any further explanations:\n\"Completely Grounded\" - if the prediction is grounded in the evidence.\n\"Mostly grounded\" - if the vast majority of the information in the prediction is grounded in the evidence, but there is a small or negligible part of the prediction which is not present in the evidence.\n\"Somewhat grounded\" - If some of the information in the prediction is grounded in the evidence.\n\"Not grounded\" - If most or all of the information in the prediction is not grounded in the evidence\n" + "instruction": "You are given a question, the corresponding evidence and a prediction from a model. Compare the \"Prediction\" and the \"Evidence\" to determine to what extent the prediction is grounded in the evidence.\nTo be grounded in the evidence, all the information of the prediction must either be present in the evidence or deducible from the evidence.\n\nBase your answer only on the information in the evidence. If the prediction is correct but not present in the evidence - it is not grounded.\nThe question is only given for context, and is irrelevant for determining the groundedness of the prediction.\nReply with one of the 4 options, without any further explanations:\n\"Completely Grounded\" - if the prediction is grounded in the evidence.\n\"Mostly grounded\" - if the vast majority of the information in the prediction is grounded in the evidence, but there is a small or negligible part of the prediction which is not present in the evidence.\n\"Somewhat grounded\" - If some of the information in the prediction is grounded in the evidence.\n\"Not grounded\" - If most or all of the information in the prediction is not grounded in the evidence\n" } diff --git a/src/unitxt/catalog/templates/rag_eval/faithfulness/judge_with_question_simplified_verbal_good_bad.json b/src/unitxt/catalog/templates/rag_eval/faithfulness/judge_with_question_simplified_verbal_good_bad.json index 88443a1fcb..609d41e349 100644 --- a/src/unitxt/catalog/templates/rag_eval/faithfulness/judge_with_question_simplified_verbal_good_bad.json +++ b/src/unitxt/catalog/templates/rag_eval/faithfulness/judge_with_question_simplified_verbal_good_bad.json @@ -9,5 +9,5 @@ ], "reference": "{number_val}", "target_prefix": "Answer: ", - "instruction": "You are given a question, the corresponding evidence and a prediction from a model.\nCompare the \"Prediction\" and the \"Evidence\" to determine whether or not the prediction is grounded in the evidence.\nA good prediction is a prediction which is grounded in the evidence, i.e. all the information of the prediction must either be present in the evidence or deducible from the evidence.\n\nThe question is only given for context, and is irrelevant for determining the groundedness of the prediction.\nProvide a rating from one of the following choices: 'Very Bad', 'Bad', 'Mediocre', 'Good', 'Very Good'. 
Reply using the format of [[rating]], for example: '[[Mediocre]]'.\n" + "instruction": "You are given a question, the corresponding evidence and a prediction from a model.\nCompare the \"Prediction\" and the \"Evidence\" to determine whether or not the prediction is grounded in the evidence.\nA good prediction is a prediction which is grounded in the evidence, i.e. all the information of the prediction must either be present in the evidence or deducible from the evidence.\n\nBase your answer only on the information in the evidence. If the prediction is correct but not present in the evidence - it is not grounded.\nThe question is only given for context, and is irrelevant for determining the groundedness of the prediction.\nProvide a rating from one of the following choices: 'Very Bad', 'Bad', 'Mediocre', 'Good', 'Very Good'. Reply using the format of [[rating]], for example: '[[Mediocre]]'.\n" } diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 1d4666fe9b..1283204cea 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -2878,7 +2878,9 @@ def _infer( """Main inference entry point.""" loop = asyncio.get_event_loop() responses = loop.run_until_complete(self._infer_async(dataset)) + return self.get_return_object(responses, return_meta_data) + def get_return_object(self, responses, return_meta_data): if return_meta_data: return responses @@ -2929,6 +2931,7 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin): "llama-3-8b-instruct": "watsonx/meta-llama/llama-3-8b-instruct", "llama-3-70b-instruct": "watsonx/meta-llama/llama-3-70b-instruct", "llama-3-1-70b-instruct": "watsonx/meta-llama/llama-3-1-70b-instruct", + "llama-3-3-70b-instruct": "watsonx/meta-llama/llama-3-3-70b-instruct", "granite-3-8b-instruct": "watsonx/ibm/granite-3-8b-instruct", "flan-t5-xxl": "watsonx/google/flan-t5-xxl", "llama-3-2-1b-instruct": "watsonx/meta-llama/llama-3-2-1b-instruct", @@ -2965,6 +2968,8 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin): "llama-3-1-70b-instruct": "meta-llama/llama-3-1-70b-instruct", "llama-3-2-11b-vision-instruct": "meta-llama/Llama-3.2-11B-Vision-Instruct", "llama-3-2-90b-vision-instruct": "meta-llama/Llama-3.2-90B-Vision-Instruct", + "llama-3-3-70b-instruct": "meta-llama/llama-3-3-70b-instruct", + "llama-3-1-405b-instruct-fp8": "meta-llama/llama-3-1-405b-instruct-fp8", "mistral-large-instruct": "mistralai/mistral-large-instruct-2407", "mixtral-8x7b-instruct": "mistralai/mixtral-8x7B-instruct-v0.1", }, @@ -2976,8 +2981,8 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin): "gpt-4o": "gpt-4o", "gpt-4o-2024-08-06": "gpt-4o-2024-08-06", "gpt-4o-2024-05-13": "gpt-4o-2024-05-13", - "gpt-4-turbo": "gpt-4-turbo", "gpt-4-turbo-preview": "gpt-4-0125-preview", + "gpt-4-turbo": "gpt-4-turbo", "gpt-4-0125-preview": "gpt-4-0125-preview", "gpt-4-1106-preview": "gpt-4-1106-preview", "gpt-3.5-turbo-1106": "gpt-3.5-turbo-1106", @@ -3007,6 +3012,7 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin): "gpt-4-32k-0613": "azure/gpt-4-32k-0613", "gpt-4-1106-preview": "azure/gpt-4-1106-preview", "gpt-4-0125-preview": "azure/gpt-4-0125-preview", + "gpt-4-turbo": "azure/gpt-4-turbo-2024-04-09", "gpt-3.5-turbo": "azure/gpt-3.5-turbo", "gpt-3.5-turbo-0301": "azure/gpt-3.5-turbo-0301", "gpt-3.5-turbo-0613": "azure/gpt-3.5-turbo-0613", @@ -3034,6 +3040,9 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin): "rits": {"model": "model_name"}, } + def 
get_return_object(self, **kwargs): + return self.engine.get_return_object(kwargs) + def get_provider_name(self): return self.provider if self.provider is not None else settings.default_provider @@ -3147,6 +3156,12 @@ def _infer( dataset: Union[List[Dict[str, Any]], Dataset], return_meta_data: bool = False, ) -> Union[List[str], List[TextGenerationInferenceOutput]]: + if return_meta_data and not hasattr(self.engine, "get_return_object"): + raise NotImplementedError( + f"Inference engine {self.engine.__class__.__name__} does not support return_meta_data as it " + f"does not contain a 'get_return_object' method. Please set return_meta_data=False." + ) + inputs = [] for instance in dataset: diff --git a/src/unitxt/llm_as_judge_from_template.py b/src/unitxt/llm_as_judge_from_template.py index b04378d680..25eb85ff92 100644 --- a/src/unitxt/llm_as_judge_from_template.py +++ b/src/unitxt/llm_as_judge_from_template.py @@ -412,15 +412,15 @@ def prepare(self): # if format is not directly set in constructor, choose according to the inference model def set_format_for_inference_engine(self): model_name = self.inference_model.get_engine_id() - # TODO : better format resolution to support more chat_api options - if "rits" in model_name or "openai" in model_name: - format_name = "formats.chat_api" - elif re.search("llama.?3.*instruct", model_name): - format_name = "formats.llama3_instruct" - elif re.search("mixtral", model_name): - format_name = "formats.models.mistral.instruction" + if "_wml" in model_name: + if re.search("llama.?3.*instruct", model_name): + format_name = "formats.llama3_instruct" + elif re.search("mixtral", model_name): + format_name = "formats.models.mistral.instruction" + else: + format_name = "formats.empty" else: - format_name = "formats.empty" + format_name = "formats.chat_api" self.format = self.get_artifact(format_name) def get_full_task_name(self): @@ -459,11 +459,15 @@ def prepare_instances(self, references, predictions, task_data): judge_task_input_field, judge_task_input_field ) new_val = input_instance.get(orig_task_field_name) + if not new_val and isinstance(prediction, dict): + new_val = prediction.get(orig_task_field_name) if new_val: instance_task_data[judge_task_input_field] = new_val if self.prediction_field and prediction: - instance_task_data[self.prediction_field] = str(prediction) + if isinstance(prediction, dict): + prediction = prediction[self.prediction_field] + instance_task_data[self.prediction_field] = prediction instance_task_data = judge_task.process(instance_task_data)["input_fields"] data_classification_policy = input_instance.get("metadata", {}).get( diff --git a/src/unitxt/test_utils/metrics.py b/src/unitxt/test_utils/metrics.py index ee0d356150..25c23c3e6e 100644 --- a/src/unitxt/test_utils/metrics.py +++ b/src/unitxt/test_utils/metrics.py @@ -102,6 +102,7 @@ def test_metric( if isinstance(metric, GlobalMetric) and metric.n_resamples: metric = deepcopy(metric) metric.n_resamples = 3 # Use a low number of resamples in testing for GlobalMetric, to save runtime + outputs = apply_metric(metric, predictions, references, task_data) check_scores(
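Editor's note on the inference.py hunk above: it adds a "llama-3-3-70b-instruct" alias to the watsonx and RITS model maps of CrossProviderInferenceEngine, which is what allows the new llama_3_3_70b_instruct_watsonx and llama_3_3_70b_instruct_rits judge metrics to resolve the same model name on either provider. A minimal sketch of using that alias directly follows; the provider choice and generation parameters are illustrative and not taken from this PR, and credentials are assumed to be configured in the environment.

# Minimal sketch: instantiating the engine with the newly mapped alias.
from unitxt.inference import CrossProviderInferenceEngine

engine = CrossProviderInferenceEngine(
    model="llama-3-3-70b-instruct",  # alias added to the watsonx and RITS maps above
    provider="watsonx",              # "rits" would select the RITS mapping instead
    max_tokens=256,
)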