From 05fc8eb2ceebaf9a6043bb361abecea0199cb461 Mon Sep 17 00:00:00 2001
From: Roni Friedman-Melamed
Date: Thu, 31 Oct 2024 13:58:48 +0200
Subject: [PATCH 1/4] generic llm as a judge for binary and idk

Signed-off-by: Roni Friedman-Melamed
---
 prepare/metrics/llm_as_judge/binary_judge.py  | 44 +++++++++------
 .../metrics/llm_as_judge/conversation_idk.py  | 56 ++++++++++---------
 ...ngine_answer_correctness_q_a_gt_loose.json | 12 ++++
 ...gine_answer_correctness_q_a_gt_strict.json | 12 ++++
 ...inference_engine_answer_relevance_q_a.json | 12 ++++
 ...nce_engine_context_relevance_q_c_ares.json | 12 ++++
 ...nce_engine_correctness_holistic_q_c_a.json | 12 ++++
 ...ric_inference_engine_faithfulness_c_a.json | 12 ++++
 ...c_inference_engine_faithfulness_q_c_a.json | 12 ++++
 .../generic_inference_judges.json             | 10 ++++
 10 files changed, 152 insertions(+), 42 deletions(-)
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_answer_correctness_q_a_gt_loose.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_answer_correctness_q_a_gt_strict.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_answer_relevance_q_a.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_context_relevance_q_c_ares.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_correctness_holistic_q_c_a.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_faithfulness_c_a.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_faithfulness_q_c_a.json
 create mode 100644 src/unitxt/catalog/metrics/llm_as_judge/conversation_answer_idk/generic_inference_judges.json

diff --git a/prepare/metrics/llm_as_judge/binary_judge.py b/prepare/metrics/llm_as_judge/binary_judge.py
index 58cfb280d3..fe237faee2 100644
--- a/prepare/metrics/llm_as_judge/binary_judge.py
+++ b/prepare/metrics/llm_as_judge/binary_judge.py
@@ -2,6 +2,7 @@
 from unitxt.llm_as_judge import (
     TaskBasedLLMasJudge,
 )
+from unitxt.inference import GenericInferenceEngine
 
 metric_type_to_template_dict = {
     "faithfulness": {
@@ -17,6 +18,12 @@
     "answer_relevance": {"q_a": "judge_answer_relevance"},
 }
 
+generic_engine_label = "generic_inference_engine"
+
+inference_models = {
+    "llama_3_1_70b_instruct_wml": "engines.classification.llama_3_1_70b_instruct_wml",
+    generic_engine_label: GenericInferenceEngine()
+}
 
 def get_prediction_field(metric_type):
     return None if metric_type == "context_relevance" else "answer"
@@ -27,20 +34,23 @@ def get_prediction_field(metric_type):
         task_name = f"tasks.rag_eval.{metric_type}.binary"
 
         for use_logprobs in [True, False]:
-            logprobs_label = "_logprobs" if use_logprobs else ""
-            metric_label = f"{metric_type}_{template_short_name}{logprobs_label}"
-            metric = TaskBasedLLMasJudge(
-                inference_model="engines.classification.llama_3_1_70b_instruct_wml",
-                template=f"templates.rag_eval.{metric_type}.{template_name}{logprobs_label}",
-                task=task_name,
-                format="formats.empty",
-                main_score=metric_label,
-                prediction_field=get_prediction_field(metric_type),
-                infer_log_probs=use_logprobs,
-            )
-
-            add_to_catalog(
-                metric,
-                f"metrics.llm_as_judge.binary.llama_3_1_70b_instruct_wml_{metric_label}",
-                overwrite=True,
-            )
+            for inf_label,inference_model in inference_models.items():
+                if use_logprobs and inf_label == generic_engine_label: # engine GenericInferenceEngine does not support logprobs
+                    continue
+                logprobs_label = "_logprobs" if use_logprobs else ""
+                metric_label = f"{metric_type}_{template_short_name}{logprobs_label}"
+                metric = TaskBasedLLMasJudge(
+                    inference_model=inference_model,
+                    template=f"templates.rag_eval.{metric_type}.{template_name}{logprobs_label}",
+                    task=task_name,
+                    format="formats.empty",
+                    main_score=metric_label,
+                    prediction_field=get_prediction_field(metric_type),
+                    infer_log_probs=use_logprobs,
+                )
+
+                add_to_catalog(
+                    metric,
+                    f"metrics.llm_as_judge.binary.{inf_label}_{metric_label}",
+                    overwrite=True,
+                )
diff --git a/prepare/metrics/llm_as_judge/conversation_idk.py b/prepare/metrics/llm_as_judge/conversation_idk.py
index 71516e96e5..502ff707f6 100644
--- a/prepare/metrics/llm_as_judge/conversation_idk.py
+++ b/prepare/metrics/llm_as_judge/conversation_idk.py
@@ -2,36 +2,42 @@
 from unitxt.inference import (
     IbmGenAiInferenceEngine,
     IbmGenAiInferenceEngineParams,
+    GenericInferenceEngine,
 )
 from unitxt.llm_as_judge import LLMAsJudge
 
-platform = "ibm_gen_ai"
-gen_params = IbmGenAiInferenceEngineParams(max_new_tokens=256)
-
-model_name = "meta-llama/llama-3-70b-instruct"
 template_name = "templates.response_assessment.judges.idk.v1"
-inference_model = IbmGenAiInferenceEngine(model_name=model_name, parameters=gen_params)
-
-model_label = model_name.split("/")[1].replace("-", "")
-template_label = template_name.split(".")[-1]
-
-metric_label = (
-    "metrics.llm_as_judge.rating." + model_label + "_template_" + template_label
-)
-
-cur_metric = LLMAsJudge(
-    inference_model=inference_model,
-    template=template_name,
-    task="rating.single_turn",
-    main_score=metric_label,
-    prediction_type="str",
-)
+inference_models = {
+    "llama3_v1_ibmgenai" : {
+        "model_name": "llama370binstruct",
+        "inference_model": IbmGenAiInferenceEngine(model_name="meta-llama/llama-3-70b-instruct",
+                                                   parameters=IbmGenAiInferenceEngineParams(max_new_tokens=256))},
+    "generic_inference_engine": {
+        "model_name" :"generic",
+        "inference_model" : (GenericInferenceEngine())
+    }
+}
+
+for label,inference_model in inference_models.items():
+    model_label = inference_model["model_name"]
+    template_label = template_name.split(".")[-1]
+    metric_label = (
+        "metrics.llm_as_judge.rating." + model_label + "_template_" + template_label
+    )
+
+    cur_metric = LLMAsJudge(
+        inference_model=inference_model["inference_model"],
+        template=template_name,
+        task="rating.single_turn",
+        main_score=metric_label,
+        prediction_type="str",
+    )
 
 # _description__= "Does the model response say I don't know?"
-add_to_catalog(
-    cur_metric,
-    "metrics.llm_as_judge.conversation_answer_idk.llama3_v1_ibmgenai_judges",
-    overwrite=True,
-)
+    add_to_catalog(
+        cur_metric,
+        f"metrics.llm_as_judge.conversation_answer_idk.{label}_judges",
+        overwrite=True,
+    )
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_answer_correctness_q_a_gt_loose.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_answer_correctness_q_a_gt_loose.json
new file mode 100644
index 0000000000..7d05ec90ef
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_answer_correctness_q_a_gt_loose.json
@@ -0,0 +1,12 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": {
+        "__type__": "generic_inference_engine"
+    },
+    "template": "templates.rag_eval.answer_correctness.judge_loose_match_no_context",
+    "task": "tasks.rag_eval.answer_correctness.binary",
+    "format": "formats.empty",
+    "main_score": "answer_correctness_q_a_gt_loose",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_answer_correctness_q_a_gt_strict.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_answer_correctness_q_a_gt_strict.json
new file mode 100644
index 0000000000..741c8f85e7
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_answer_correctness_q_a_gt_strict.json
@@ -0,0 +1,12 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": {
+        "__type__": "generic_inference_engine"
+    },
+    "template": "templates.rag_eval.answer_correctness.judge_simplified_format",
+    "task": "tasks.rag_eval.answer_correctness.binary",
+    "format": "formats.empty",
+    "main_score": "answer_correctness_q_a_gt_strict",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_answer_relevance_q_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_answer_relevance_q_a.json
new file mode 100644
index 0000000000..fdc8511c08
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_answer_relevance_q_a.json
@@ -0,0 +1,12 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": {
+        "__type__": "generic_inference_engine"
+    },
+    "template": "templates.rag_eval.answer_relevance.judge_answer_relevance",
+    "task": "tasks.rag_eval.answer_relevance.binary",
+    "format": "formats.empty",
+    "main_score": "answer_relevance_q_a",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_context_relevance_q_c_ares.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_context_relevance_q_c_ares.json
new file mode 100644
index 0000000000..3c8f9f4f29
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_context_relevance_q_c_ares.json
@@ -0,0 +1,12 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": {
+        "__type__": "generic_inference_engine"
+    },
+    "template": "templates.rag_eval.context_relevance.judge_context_relevance_ares",
+    "task": "tasks.rag_eval.context_relevance.binary",
+    "format": "formats.empty",
+    "main_score": "context_relevance_q_c_ares",
+    "prediction_field": null,
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_correctness_holistic_q_c_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_correctness_holistic_q_c_a.json
new file mode 100644
index 0000000000..07046c02e9
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_correctness_holistic_q_c_a.json
@@ -0,0 +1,12 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": {
+        "__type__": "generic_inference_engine"
+    },
+    "template": "templates.rag_eval.correctness_holistic.judge_correctness_simple",
+    "task": "tasks.rag_eval.correctness_holistic.binary",
+    "format": "formats.empty",
+    "main_score": "correctness_holistic_q_c_a",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_faithfulness_c_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_faithfulness_c_a.json
new file mode 100644
index 0000000000..424757fd0a
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_faithfulness_c_a.json
@@ -0,0 +1,12 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": {
+        "__type__": "generic_inference_engine"
+    },
+    "template": "templates.rag_eval.faithfulness.judge_no_question_simplified",
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "format": "formats.empty",
+    "main_score": "faithfulness_c_a",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_faithfulness_q_c_a.json b/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_faithfulness_q_c_a.json
new file mode 100644
index 0000000000..e0728afda8
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/binary/generic_inference_engine_faithfulness_q_c_a.json
@@ -0,0 +1,12 @@
+{
+    "__type__": "task_based_ll_mas_judge",
+    "inference_model": {
+        "__type__": "generic_inference_engine"
+    },
+    "template": "templates.rag_eval.faithfulness.judge_with_question_simplified",
+    "task": "tasks.rag_eval.faithfulness.binary",
+    "format": "formats.empty",
+    "main_score": "faithfulness_q_c_a",
+    "prediction_field": "answer",
+    "infer_log_probs": false
+}
diff --git a/src/unitxt/catalog/metrics/llm_as_judge/conversation_answer_idk/generic_inference_judges.json b/src/unitxt/catalog/metrics/llm_as_judge/conversation_answer_idk/generic_inference_judges.json
new file mode 100644
index 0000000000..2c32f1b56a
--- /dev/null
+++ b/src/unitxt/catalog/metrics/llm_as_judge/conversation_answer_idk/generic_inference_judges.json
@@ -0,0 +1,10 @@
+{
+    "__type__": "llm_as_judge",
+    "inference_model": {
+        "__type__": "generic_inference_engine"
+    },
+    "template": "templates.response_assessment.judges.idk.v1",
+    "task": "rating.single_turn",
+    "main_score": "metrics.llm_as_judge.rating.generic_template_v1",
+    "prediction_type": "str"
+}

From f4cd6b258c6d84c1cd02b7f394b6049519f528f5 Mon Sep 17 00:00:00 2001
From: Roni Friedman-Melamed
Date: Thu, 31 Oct 2024 14:01:20 +0200
Subject: [PATCH 2/4] unify generic inference label

Signed-off-by: Roni Friedman-Melamed
---
 ...inference_judges.json => generic_inference_engine_judges.json} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename src/unitxt/catalog/metrics/llm_as_judge/conversation_answer_idk/{generic_inference_judges.json => generic_inference_engine_judges.json} (100%)

diff --git a/src/unitxt/catalog/metrics/llm_as_judge/conversation_answer_idk/generic_inference_judges.json b/src/unitxt/catalog/metrics/llm_as_judge/conversation_answer_idk/generic_inference_engine_judges.json
similarity index 100%
rename from src/unitxt/catalog/metrics/llm_as_judge/conversation_answer_idk/generic_inference_judges.json
rename to src/unitxt/catalog/metrics/llm_as_judge/conversation_answer_idk/generic_inference_engine_judges.json

From ce5287048dd68e2bbf4f03b698d8564b82038c3f Mon Sep 17 00:00:00 2001
From: Roni Friedman-Melamed
Date: Thu, 31 Oct 2024 15:12:11 +0200
Subject: [PATCH 3/4] ruff

Signed-off-by: Roni Friedman-Melamed
---
 prepare/metrics/llm_as_judge/binary_judge.py | 11 ++++++----
 .../metrics/llm_as_judge/conversation_idk.py | 21 +++++++++++--------
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/prepare/metrics/llm_as_judge/binary_judge.py b/prepare/metrics/llm_as_judge/binary_judge.py
index fe237faee2..9e33d4d70d 100644
--- a/prepare/metrics/llm_as_judge/binary_judge.py
+++ b/prepare/metrics/llm_as_judge/binary_judge.py
@@ -1,8 +1,8 @@
 from unitxt import add_to_catalog
+from unitxt.inference import GenericInferenceEngine
 from unitxt.llm_as_judge import (
     TaskBasedLLMasJudge,
 )
-from unitxt.inference import GenericInferenceEngine
 
 metric_type_to_template_dict = {
     "faithfulness": {
@@ -22,9 +22,10 @@
 
 inference_models = {
     "llama_3_1_70b_instruct_wml": "engines.classification.llama_3_1_70b_instruct_wml",
-    generic_engine_label: GenericInferenceEngine()
+    generic_engine_label: GenericInferenceEngine(),
 }
 
+
 def get_prediction_field(metric_type):
     return None if metric_type == "context_relevance" else "answer"
 
@@ -34,8 +35,10 @@ def get_prediction_field(metric_type):
         task_name = f"tasks.rag_eval.{metric_type}.binary"
 
         for use_logprobs in [True, False]:
-            for inf_label,inference_model in inference_models.items():
-                if use_logprobs and inf_label == generic_engine_label: # engine GenericInferenceEngine does not support logprobs
+            for inf_label, inference_model in inference_models.items():
+                if (
+                    use_logprobs and inf_label == generic_engine_label
+                ):  # engine GenericInferenceEngine does not support logprobs
                     continue
                 logprobs_label = "_logprobs" if use_logprobs else ""
                 metric_label = f"{metric_type}_{template_short_name}{logprobs_label}"
diff --git a/prepare/metrics/llm_as_judge/conversation_idk.py b/prepare/metrics/llm_as_judge/conversation_idk.py
index 502ff707f6..48ae49b8f9 100644
--- a/prepare/metrics/llm_as_judge/conversation_idk.py
+++ b/prepare/metrics/llm_as_judge/conversation_idk.py
@@ -1,25 +1,28 @@
 from unitxt import add_to_catalog
 from unitxt.inference import (
+    GenericInferenceEngine,
     IbmGenAiInferenceEngine,
     IbmGenAiInferenceEngineParams,
-    GenericInferenceEngine,
 )
 from unitxt.llm_as_judge import LLMAsJudge
 
 template_name = "templates.response_assessment.judges.idk.v1"
 
 inference_models = {
-    "llama3_v1_ibmgenai" : {
+    "llama3_v1_ibmgenai": {
         "model_name": "llama370binstruct",
-        "inference_model": IbmGenAiInferenceEngine(model_name="meta-llama/llama-3-70b-instruct",
-                                                   parameters=IbmGenAiInferenceEngineParams(max_new_tokens=256))},
+        "inference_model": IbmGenAiInferenceEngine(
+            model_name="meta-llama/llama-3-70b-instruct",
+            parameters=IbmGenAiInferenceEngineParams(max_new_tokens=256),
+        ),
+    },
     "generic_inference_engine": {
-        "model_name" :"generic",
-        "inference_model" : (GenericInferenceEngine())
-    }
+        "model_name": "generic",
+        "inference_model": (GenericInferenceEngine()),
+    },
 }
 
-for label,inference_model in inference_models.items():
+for label, inference_model in inference_models.items():
     model_label = inference_model["model_name"]
     template_label = template_name.split(".")[-1]
     metric_label = (
         "metrics.llm_as_judge.rating." + model_label + "_template_" + template_label
     )
@@ -34,7 +37,7 @@
         prediction_type="str",
     )
 
-# _description__= "Does the model response say I don't know?"
+    # _description__= "Does the model response say I don't know?"
 
     add_to_catalog(
         cur_metric,

From 53ce0eddf027cc57661d2982f75f38abf5f1c28a Mon Sep 17 00:00:00 2001
From: lilacheden
Date: Mon, 4 Nov 2024 15:43:33 +0200
Subject: [PATCH 4/4] fix processor bug

Signed-off-by: lilacheden
---
 prepare/processors/processors.py | 2 +-
 .../processors/cast_to_float_return_0_5_if_failed.json | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/prepare/processors/processors.py b/prepare/processors/processors.py
index 87a8299430..2cf29bfbe1 100644
--- a/prepare/processors/processors.py
+++ b/prepare/processors/processors.py
@@ -221,7 +221,7 @@
 )
 
 add_to_catalog(
-    PostProcess(Cast(to="float", failure_default={"float": 0.5})),
+    PostProcess(Cast(to="float", failure_default=0.5)),
     "processors.cast_to_float_return_0_5_if_failed",
     overwrite=True,
 )
diff --git a/src/unitxt/catalog/processors/cast_to_float_return_0_5_if_failed.json b/src/unitxt/catalog/processors/cast_to_float_return_0_5_if_failed.json
index 2b9024f977..ad424b45f4 100644
--- a/src/unitxt/catalog/processors/cast_to_float_return_0_5_if_failed.json
+++ b/src/unitxt/catalog/processors/cast_to_float_return_0_5_if_failed.json
@@ -3,8 +3,6 @@
     "operator": {
         "__type__": "cast",
         "to": "float",
-        "failure_default": {
-            "float": 0.5
-        }
+        "failure_default": 0.5
     }
 }
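
Note for reviewers: below is a minimal usage sketch, separate from the patches themselves, of how one of the new generic-engine judge metrics might be exercised. It assumes GenericInferenceEngine resolves its concrete backend from the UNITXT_INFERENCE_ENGINE environment variable (or its default field), and the card and template names are illustrative placeholders rather than requirements of this series.

import os

from unitxt import evaluate, load_dataset

# Assumption: GenericInferenceEngine picks the actual judge backend up from this
# environment variable; any engine reference available in the catalog should work.
os.environ["UNITXT_INFERENCE_ENGINE"] = "engines.classification.llama_3_1_70b_instruct_wml"

# Illustrative RAG card and template; the relevant part is the metric name,
# which is the generic_inference_engine-prefixed catalog entry added in PATCH 1/4.
dataset = load_dataset(
    card="cards.rag.response_generation.clapnq",
    template="templates.rag.response_generation.please_respond",
    metrics=["metrics.llm_as_judge.binary.generic_inference_engine_faithfulness_q_c_a"],
    loader_limit=8,
)["test"]

# Predictions would normally come from the RAG system under test.
predictions = ["I don't know." for _ in dataset]

results = evaluate(predictions=predictions, data=dataset)
print(results[0]["score"]["global"])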