forked from Azure/azure-sdk-for-python
-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
update built-in chat eval flow (Azure#34896)
* update built-in chat eval flow * move groundedness to groundedness service * add fallback logic when groundedness service is not available in the region * change reason column name to <metric>_reason * add flight control to flow input * update _metric_handler.py for built-in chat flow * add fallback_groundedness_evaluation node to chat node list * keep evaluation_per_turn column in output * update e2e tests of chat evaluation flow * update built-in qa evaluation flow * fix fallback logic check in validate_service * add e2e test of fallback groundedness evaluation
- Loading branch information
Showing
22 changed files
with
754 additions
and
327 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
15 changes: 15 additions & 0 deletions
15
...re/ai/generative/evaluate/pf_templates/built_in_metrics/chat/call_groundedness_service.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
from promptflow import tool | ||
from rai_client import RAIServiceHandler | ||
|
||
|
||
@tool
def call_groundedness_service(request_bodies: list[dict]) -> list[dict]:
    """Submit each groundedness request to the RAI service and collect
    the annotation results.

    :param request_bodies: one annotation request payload per
        conversation turn (as built by the request-construction node).
    :return: flat list of annotation-result dicts. A turn whose service
        call fails contributes nothing, so the list may be shorter than
        ``request_bodies``.
    """
    # Fix: the original annotated the return as ``[dict]`` — a list
    # literal, not a type — which confuses type checkers and tooling.
    service_handler = RAIServiceHandler()
    annotation_results = []
    for request_body in request_bodies:
        try:
            annotation_result = service_handler.get_annotation(request_body)
        except Exception:
            # Best-effort: one failing turn must not abort the whole
            # chat evaluation; treat it as "no results for this turn".
            annotation_result = []
        annotation_results += annotation_result
    return annotation_results
73 changes: 3 additions & 70 deletions
73
...ative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/call_rai_service.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,76 +1,9 @@ | ||
from promptflow import tool | ||
from mlflow.utils.rest_utils import http_request | ||
import time | ||
from utils import get_cred | ||
from constants import RAIService | ||
from rai_client import RAIServiceHandler | ||
|
||
|
||
def submit_annotation(cred, request_body): | ||
try: | ||
response = http_request( | ||
host_creds=cred, | ||
endpoint="/submitannotation", | ||
method="POST", | ||
json=request_body, | ||
) | ||
if response.status_code != 202: | ||
print("Fail evaluating '%s' with error message: %s", request_body["UserTextList"], response.text) | ||
response.raise_for_status() | ||
except AttributeError as e: | ||
response = None | ||
print("Fail evaluating '%s' with error message: %s", request_body["UserTextList"], e) | ||
if response is not None: | ||
json_obj = response.json() | ||
else: | ||
json_obj = {} | ||
return json_obj | ||
|
||
def check_status(cred, request_id): | ||
try: | ||
response = http_request( | ||
host_creds = cred, | ||
endpoint="/operations/" + request_id, | ||
method="GET" | ||
) | ||
except AttributeError as e: | ||
response = None | ||
return response | ||
|
||
def retrieve_annotation_result(cred, submitannotation_response): | ||
request_id = submitannotation_response["location"].split("/")[-1] | ||
annotation_result = None | ||
start = time.time() | ||
time_elapsed = 0 | ||
request_count = 1 | ||
while True and time_elapsed <= RAIService.TIMEOUT: | ||
try: | ||
request_status = check_status(cred, request_id) | ||
except Exception: | ||
request_status = None | ||
if request_status: | ||
request_status_code = request_status.status_code | ||
if request_status_code == 200: | ||
annotation_result = request_status.json() | ||
break | ||
else: | ||
print("Failed to retrieve the status of RequestID: %s" % request_id) | ||
request_count += 1 | ||
sleep_time = RAIService.SLEEPTIME ** request_count | ||
time.sleep(sleep_time) | ||
time_elapsed = time.time() - start | ||
|
||
if time_elapsed > RAIService.TIMEOUT: | ||
raise TimeoutError("Request times out after %d seconds", RAIService.TIMEOUT) | ||
|
||
return annotation_result | ||
|
||
# The inputs section will change based on the arguments of the tool function, after you save the code | ||
# Adding type to arguments and return value will help the system show the types properly | ||
# Please update the function name/signature per need | ||
@tool | ||
def call_rai_service(request_body: dict) -> dict: | ||
cred = get_cred() | ||
submitannotation_response = submit_annotation(cred, request_body) | ||
annotation_result = retrieve_annotation_result(cred, submitannotation_response) | ||
service_handler = RAIServiceHandler() | ||
annotation_result = service_handler.get_annotation(request_body) | ||
return annotation_result | ||
|
74 changes: 49 additions & 25 deletions
74
...ve/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/concatenate_metrics.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,67 +1,91 @@ | ||
from promptflow import tool | ||
import numpy as np | ||
import constants | ||
|
||
def format_rag_results(rag_results: dict, supported_metrics): | ||
|
||
def format_rag_results(rag_results: dict, | ||
selected_metrics: dict, | ||
num_turns: int): | ||
result_per_chat = {} | ||
result_per_turn = {} | ||
supported_metrics = selected_metrics["rag_metrics"] | ||
if rag_results: | ||
for metric, value in rag_results['artifacts'].items(): | ||
try: | ||
result_per_chat[metric] = rag_results['metrics']["mean_" + metric] | ||
result_per_turn[metric] = {"reason": value['reason'], "score": value['score_per_turn']} | ||
result_per_chat[metric] = round( | ||
rag_results['metrics']["mean_" + metric], | ||
2) | ||
result_per_turn[metric] = {"reason": value['reason'][0], | ||
"score": value['score_per_turn'][0]} | ||
except KeyError: | ||
result_per_chat[metric] = np.nan | ||
result_per_turn[metric] = np.nan | ||
result_per_turn[metric] = {"score": [np.nan] * int(num_turns)} | ||
for metric in supported_metrics: | ||
if metric not in result_per_turn: | ||
result_per_chat[metric] = np.nan | ||
result_per_turn[metric] = np.nan | ||
return {"results_per_turn": result_per_turn, "results_per_chat": result_per_chat} | ||
return {"results_per_turn": result_per_turn, | ||
"results_per_chat": result_per_chat} | ||
|
||
|
||
def format_non_rag_results(non_rag_results: dict, supported_metrics): | ||
def format_non_rag_results(non_rag_results: dict, | ||
selected_metrics: dict, | ||
num_turns: int): | ||
result_per_chat = {} | ||
result_per_turn = {} | ||
supported_metrics = selected_metrics["non_rag_metrics"] | ||
if non_rag_results: | ||
for metric in non_rag_results['artifacts']: | ||
try: | ||
result_per_chat[metric] = non_rag_results['metrics']['mean_' + metric] | ||
except: | ||
result_per_chat[metric] = round( | ||
non_rag_results['metrics']['mean_' + metric], | ||
2) | ||
result_per_turn[metric] = { | ||
"score": non_rag_results['artifacts'][metric]} | ||
except Exception: | ||
result_per_chat[metric] = np.nan | ||
result_per_turn = non_rag_results['artifacts'] | ||
result_per_turn[metric] = { | ||
"score": [np.nan] * int(num_turns)} | ||
|
||
for metric in supported_metrics: | ||
if metric not in result_per_turn: | ||
result_per_turn[metric] = np.nan | ||
result_per_chat[metric] = np.nan | ||
return {"results_per_turn": result_per_turn, "results_per_chat": result_per_chat} | ||
return {"results_per_turn": result_per_turn, | ||
"results_per_chat": result_per_chat} | ||
|
||
def format_safety_results(safety_results: dict, supported_metrics): | ||
|
||
def format_safety_results(safety_results: dict, selected_metrics): | ||
result_per_chat = {} | ||
supported_metrics = selected_metrics["safety_metrics"] | ||
if safety_results: | ||
result_per_chat = safety_results | ||
for metric in supported_metrics: | ||
if metric not in result_per_chat: | ||
result_per_chat[metric] = np.nan | ||
result_per_chat[metric + "_reasoning"] = np.nan | ||
result_per_chat[metric + "_reason"] = np.nan | ||
result_per_chat[metric + "_score"] = np.nan | ||
return result_per_chat | ||
|
||
# The inputs section will change based on the arguments of the tool function, after you save the code | ||
# Adding type to arguments and return value will help the system show the types properly | ||
# Please update the function name/signature per need | ||
|
||
@tool | ||
def concatenate_metrics(rag_results: dict, non_rag_results: dict, | ||
safety_results: dict, | ||
selected_metrics: dict) -> dict: | ||
formatted_rag = format_rag_results(rag_results, selected_metrics['rag_metrics']) | ||
formatted_non_rag = format_non_rag_results(non_rag_results, selected_metrics['non_rag_metrics']) | ||
formatted_safety = format_safety_results(safety_results, selected_metrics['safety_metrics']) | ||
def concatenate_metrics(rag_results: dict, non_rag_results: dict, | ||
safety_results: dict, | ||
groundedness_results: list[dict], | ||
selected_metrics: dict, | ||
chat_validation: dict) -> dict: | ||
num_turns = chat_validation["num_turns"] | ||
formatted_rag = format_rag_results(rag_results, | ||
selected_metrics, | ||
num_turns) | ||
formatted_non_rag = format_non_rag_results(non_rag_results, | ||
selected_metrics, | ||
num_turns) | ||
formatted_safety = format_safety_results(safety_results, | ||
selected_metrics) | ||
results = {} | ||
for key in ["results_per_turn", "results_per_chat"]: | ||
result_concat = formatted_rag[key].copy() | ||
result_concat.update(formatted_non_rag[key]) | ||
result_concat.update(groundedness_results[key]) | ||
if key == "results_per_chat": | ||
result_concat.update(formatted_safety) | ||
results[key] = result_concat | ||
return results | ||
return results |
22 changes: 14 additions & 8 deletions
22
...i-generative/azure/ai/generative/evaluate/pf_templates/built_in_metrics/chat/constants.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,39 +1,45 @@ | ||
from azureml.metrics import constants | ||
from enum import Enum | ||
|
||
|
||
class RAIService: | ||
"""Define constants related to RAI service""" | ||
TIMEOUT = 1800 | ||
TIMEOUT = 120 | ||
SLEEPTIME = 2 | ||
HARM_SEVERITY_THRESHOLD = 4 | ||
|
||
|
||
class Metric: | ||
"""Defines all metrics supported by RAI service""" | ||
# Content harm | ||
SelfHarm = "self_harm" | ||
Violence = "violence" | ||
Sexual = "sexual" | ||
HateFairness = "hate_fairness" | ||
HateUnfairness = "hate_unfairness" | ||
|
||
|
||
class Tasks: | ||
"""Defines types of annotation tasks supported by RAI Service.""" | ||
CONTENT_HARM = "content harm" | ||
|
||
|
||
class HarmSeverityLevel(Enum): | ||
Safe = 0 | ||
Low = 1 | ||
Medium = 2 | ||
High = 3 | ||
VeryLow = "Very low" | ||
Low = "Low" | ||
Medium = "Medium" | ||
High = "High" | ||
|
||
|
||
# Content harm metric set | ||
CONTENT_HARM_METRICS = { | ||
Metric.SelfHarm, | ||
Metric.Violence, | ||
Metric.Sexual, | ||
Metric.HateFairness | ||
Metric.HateUnfairness | ||
} | ||
|
||
|
||
RAG_EVALUATION_SET = constants.Metric.RAG_EVALUATION_SET | ||
NON_RAG_EVALUATION_SET = {constants.Metric.GPTCoherence, constants.Metric.GPTFluency} | ||
NON_RAG_EVALUATION_SET = {constants.Metric.GPTCoherence, | ||
constants.Metric.GPTFluency} | ||
QUALITY_METRIC_SET = RAG_EVALUATION_SET | NON_RAG_EVALUATION_SET |
42 changes: 42 additions & 0 deletions
42
.../generative/evaluate/pf_templates/built_in_metrics/chat/construct_groundedness_request.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
import ast
import json

from promptflow import tool
|
||
|
||
def normalize_user_text(user_text):
    """Escape every single quote in *user_text* as ``\\"`` before it is
    embedded in a service request payload."""
    return "\\\"".join(user_text.split("'"))


def construct_single_request(question: str,
                             answer: str,
                             context: dict = None) -> dict:
    """Build the groundedness-annotation request body for one turn.

    :param question: user question for this turn.
    :param answer: assistant answer for this turn.
    :param context: retrieval context, e.g. ``{"citations": [...]}``.
    :return: payload dict understood by the annotation service.
    """
    serialized = json.dumps({"question": question,
                             "answer": answer,
                             "context": context})
    return {
        "UserTextList": [normalize_user_text(serialized)],
        "AnnotationTask": "groundedness",
        "MetricList": ["generic_groundedness"],
    }
|
||
|
||
@tool
def construct_groundedness_requests(parsed_chat: dict) -> list:
    """Build one groundedness-annotation request per conversation turn.

    :param parsed_chat: dict holding parallel per-turn lists under the
        keys ``questions``, ``answers`` and ``retrieved_documents``.
    :return: list of request-body dicts, one per turn.

    Note: the original annotated the return as ``str``; the function
    has always returned a list of request bodies.
    """
    request_bodies = []
    for turn in range(len(parsed_chat["questions"])):
        question = parsed_chat["questions"][turn]
        answer = parsed_chat["answers"][turn]
        try:
            # The documents arrive as the textual repr of a Python
            # literal. ast.literal_eval parses it without executing
            # arbitrary code — this replaces an eval() on chat-derived
            # (untrusted) input; anything literal_eval cannot parse
            # falls through to the raw-string fallback below.
            retrieved_documents = ast.literal_eval(
                parsed_chat["retrieved_documents"][turn])
        except Exception:
            # Not a parsable literal: treat the raw string as the one
            # and only retrieved document for this turn.
            retrieved_documents = [
                parsed_chat["retrieved_documents"][turn]]
        context = {"citations": retrieved_documents}
        request_bodies.append(
            construct_single_request(question, answer, context))
    return request_bodies
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.