update built-in chat eval flow (Azure#34896)
* update built-in chat eval flow

* move groundedness to groundedness service
* add fallback logic when groundedness service is not available in the region
* change reason column name to <metric>_reason
* add flight control to flow input

* update _metric_handler.py for built-in chat flow

* add fallback_groundedness_evaluation node to chat node list
* keep evaluation_per_turn column in output

* update e2e tests of chat evaluation flow

* update built-in qa evaluation flow

* fix fallback logic check in validate_service
* add e2e test of groundedness fallback
qusongms authored Mar 26, 2024
1 parent 9973a20 commit b03b460
Showing 22 changed files with 754 additions and 327 deletions.
@@ -19,7 +19,7 @@

NODE_LIST_BY_TASK = {
"qa": ["gpt_coherence", "gpt_similarity", "gpt_relevance", "gpt_fluency", "gpt_groundedness"],
"chat": ["evaluate_chat_rag", "evaluate_coherence_fluency"],
"chat": ["evaluate_chat_rag", "evaluate_coherence_fluency", "fallback_groundedness_evaluation"],
}


@@ -138,6 +138,9 @@ def calculate_metrics(self) -> Dict:
if col.replace("outputs.", "").startswith(metric):
is_col_to_delete = False
break
# keep the column "evaluation_per_turn" in the output
if "evaluation_per_turn" in col:
is_col_to_delete = False
if is_col_to_delete:
columns_to_drop.append(col)
result_df.drop(columns_to_drop, axis=1, inplace=True)
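
For reference, a minimal standalone sketch of the column filter this hunk touches, run against a toy DataFrame (the column names and metric list below are illustrative, not the flow's actual output):

```python
import pandas as pd

# Toy result frame; column names are illustrative, not the flow's exact output.
result_df = pd.DataFrame({
    "outputs.gpt_coherence": [4.0],
    "outputs.evaluation_per_turn": ["{'gpt_coherence': {'score': [4, 5]}}"],
    "outputs.debug_info": ["..."],
})
metrics = ["gpt_coherence"]

columns_to_drop = []
for col in result_df.columns:
    is_col_to_delete = True
    for metric in metrics:
        if col.replace("outputs.", "").startswith(metric):
            is_col_to_delete = False
            break
    # keep the column "evaluation_per_turn" in the output
    if "evaluation_per_turn" in col:
        is_col_to_delete = False
    if is_col_to_delete:
        columns_to_drop.append(col)
result_df.drop(columns_to_drop, axis=1, inplace=True)

print(list(result_df.columns))
# ['outputs.gpt_coherence', 'outputs.evaluation_per_turn']
```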
@@ -7,16 +7,16 @@

@tool
def aggregate_variants_results(results: List[dict], metrics: List[dict],
thresholds = None):
if metrics:
thresholds=None):
if metrics:
rag_metrics = filter_metrics(metrics[0]["rag_metrics"])
non_rag_metrics = filter_metrics(metrics[0]["non_rag_metrics"])
safety_metrics = filter_metrics(metrics[0]["safety_metrics"])
else:
rag_metrics = []
non_rag_metrics = []
safety_metrics = []
selected_quality_metrics = rag_metrics + non_rag_metrics
selected_quality_metrics = rag_metrics + non_rag_metrics

# set threshold for safety metric defect_rate
if thresholds != [] and thresholds is not None:
@@ -51,7 +51,8 @@ def aggregate_variants_results(results: List[dict], metrics: List[dict],
if name in selected_quality_metrics:
aggregate_output[metric_name] = round(np.nanmean(values), 2)
elif name in safety_metrics:
aggregate_output[metric_name] = round(np.sum(values >= threshold) / len(values), 2)
aggregate_output[metric_name] = round(
np.sum(values >= threshold) / len(values), 2)
else:
aggregate_output[metric_name] = np.nan
log_metric(metric_name, aggregate_output[metric_name])
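
A quick worked check of the defect-rate line above, assuming the threshold falls back to HARM_SEVERITY_THRESHOLD (4, per constants.py) when none is supplied, and using made-up scores:

```python
import numpy as np

# Illustrative harm-severity scores for four evaluated chats.
values = np.array([0, 2, 5, 7])
threshold = 4  # assumed fallback: HARM_SEVERITY_THRESHOLD in constants.py

# Defect rate: fraction of scores at or above the threshold.
defect_rate = round(np.sum(values >= threshold) / len(values), 2)
print(defect_rate)  # 0.5
```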
@@ -0,0 +1,15 @@
from promptflow import tool
from rai_client import RAIServiceHandler


@tool
def call_groundedness_service(request_bodies: list[dict]) -> [dict]:
service_handler = RAIServiceHandler()
annotation_results = []
for request_body in request_bodies:
try:
annotation_result = service_handler.get_annotation(request_body)
except Exception:
annotation_result = []
annotation_results += annotation_result
return annotation_results
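
A small standalone sketch of the per-request error handling in this new node; StubServiceHandler and its return value are stand-ins for rai_client.RAIServiceHandler, which is not shown in this diff:

```python
# Stand-in for rai_client.RAIServiceHandler; the real handler calls the
# RAI annotation service, this stub only mimics success and failure.
class StubServiceHandler:
    def get_annotation(self, request_body):
        if "fail" in request_body["UserTextList"][0]:
            raise RuntimeError("service unavailable")
        return [{"generic_groundedness": 5}]  # made-up annotation payload


def call_groundedness_service_like(request_bodies, handler):
    annotation_results = []
    for request_body in request_bodies:
        try:
            annotation_result = handler.get_annotation(request_body)
        except Exception:
            annotation_result = []  # a failed request simply contributes nothing
        annotation_results += annotation_result
    return annotation_results


print(call_groundedness_service_like(
    [{"UserTextList": ["ok"]}, {"UserTextList": ["fail"]}],
    StubServiceHandler(),
))
# [{'generic_groundedness': 5}]
```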
@@ -1,76 +1,9 @@
from promptflow import tool
from mlflow.utils.rest_utils import http_request
import time
from utils import get_cred
from constants import RAIService
from rai_client import RAIServiceHandler


def submit_annotation(cred, request_body):
try:
response = http_request(
host_creds=cred,
endpoint="/submitannotation",
method="POST",
json=request_body,
)
if response.status_code != 202:
print("Fail evaluating '%s' with error message: %s", request_body["UserTextList"], response.text)
response.raise_for_status()
except AttributeError as e:
response = None
print("Fail evaluating '%s' with error message: %s", request_body["UserTextList"], e)
if response is not None:
json_obj = response.json()
else:
json_obj = {}
return json_obj

def check_status(cred, request_id):
try:
response = http_request(
host_creds = cred,
endpoint="/operations/" + request_id,
method="GET"
)
except AttributeError as e:
response = None
return response

def retrieve_annotation_result(cred, submitannotation_response):
request_id = submitannotation_response["location"].split("/")[-1]
annotation_result = None
start = time.time()
time_elapsed = 0
request_count = 1
while True and time_elapsed <= RAIService.TIMEOUT:
try:
request_status = check_status(cred, request_id)
except Exception:
request_status = None
if request_status:
request_status_code = request_status.status_code
if request_status_code == 200:
annotation_result = request_status.json()
break
else:
print("Failed to retrieve the status of RequestID: %s" % request_id)
request_count += 1
sleep_time = RAIService.SLEEPTIME ** request_count
time.sleep(sleep_time)
time_elapsed = time.time() - start

if time_elapsed > RAIService.TIMEOUT:
raise TimeoutError("Request times out after %d seconds", RAIService.TIMEOUT)

return annotation_result

# The inputs section will change based on the arguments of the tool function, after you save the code
# Adding type to arguments and return value will help the system show the types properly
# Please update the function name/signature per need
@tool
def call_rai_service(request_body: dict) -> dict:
cred = get_cred()
submitannotation_response = submit_annotation(cred, request_body)
annotation_result = retrieve_annotation_result(cred, submitannotation_response)
service_handler = RAIServiceHandler()
annotation_result = service_handler.get_annotation(request_body)
return annotation_result

@@ -1,67 +1,91 @@
from promptflow import tool
import numpy as np
import constants

def format_rag_results(rag_results: dict, supported_metrics):

def format_rag_results(rag_results: dict,
selected_metrics: dict,
num_turns: int):
result_per_chat = {}
result_per_turn = {}
supported_metrics = selected_metrics["rag_metrics"]
if rag_results:
for metric, value in rag_results['artifacts'].items():
try:
result_per_chat[metric] = rag_results['metrics']["mean_" + metric]
result_per_turn[metric] = {"reason": value['reason'], "score": value['score_per_turn']}
result_per_chat[metric] = round(
rag_results['metrics']["mean_" + metric],
2)
result_per_turn[metric] = {"reason": value['reason'][0],
"score": value['score_per_turn'][0]}
except KeyError:
result_per_chat[metric] = np.nan
result_per_turn[metric] = np.nan
result_per_turn[metric] = {"score": [np.nan] * int(num_turns)}
for metric in supported_metrics:
if metric not in result_per_turn:
result_per_chat[metric] = np.nan
result_per_turn[metric] = np.nan
return {"results_per_turn": result_per_turn, "results_per_chat": result_per_chat}
return {"results_per_turn": result_per_turn,
"results_per_chat": result_per_chat}


def format_non_rag_results(non_rag_results: dict, supported_metrics):
def format_non_rag_results(non_rag_results: dict,
selected_metrics: dict,
num_turns: int):
result_per_chat = {}
result_per_turn = {}
supported_metrics = selected_metrics["non_rag_metrics"]
if non_rag_results:
for metric in non_rag_results['artifacts']:
try:
result_per_chat[metric] = non_rag_results['metrics']['mean_' + metric]
except:
result_per_chat[metric] = round(
non_rag_results['metrics']['mean_' + metric],
2)
result_per_turn[metric] = {
"score": non_rag_results['artifacts'][metric]}
except Exception:
result_per_chat[metric] = np.nan
result_per_turn = non_rag_results['artifacts']
result_per_turn[metric] = {
"score": [np.nan] * int(num_turns)}

for metric in supported_metrics:
if metric not in result_per_turn:
result_per_turn[metric] = np.nan
result_per_chat[metric] = np.nan
return {"results_per_turn": result_per_turn, "results_per_chat": result_per_chat}
return {"results_per_turn": result_per_turn,
"results_per_chat": result_per_chat}

def format_safety_results(safety_results: dict, supported_metrics):

def format_safety_results(safety_results: dict, selected_metrics):
result_per_chat = {}
supported_metrics = selected_metrics["safety_metrics"]
if safety_results:
result_per_chat = safety_results
for metric in supported_metrics:
if metric not in result_per_chat:
result_per_chat[metric] = np.nan
result_per_chat[metric + "_reasoning"] = np.nan
result_per_chat[metric + "_reason"] = np.nan
result_per_chat[metric + "_score"] = np.nan
return result_per_chat

# The inputs section will change based on the arguments of the tool function, after you save the code
# Adding type to arguments and return value will help the system show the types properly
# Please update the function name/signature per need

@tool
def concatenate_metrics(rag_results: dict, non_rag_results: dict,
safety_results: dict,
selected_metrics: dict) -> dict:
formatted_rag = format_rag_results(rag_results, selected_metrics['rag_metrics'])
formatted_non_rag = format_non_rag_results(non_rag_results, selected_metrics['non_rag_metrics'])
formatted_safety = format_safety_results(safety_results, selected_metrics['safety_metrics'])
def concatenate_metrics(rag_results: dict, non_rag_results: dict,
safety_results: dict,
groundedness_results: list[dict],
selected_metrics: dict,
chat_validation: dict) -> dict:
num_turns = chat_validation["num_turns"]
formatted_rag = format_rag_results(rag_results,
selected_metrics,
num_turns)
formatted_non_rag = format_non_rag_results(non_rag_results,
selected_metrics,
num_turns)
formatted_safety = format_safety_results(safety_results,
selected_metrics)
results = {}
for key in ["results_per_turn", "results_per_chat"]:
result_concat = formatted_rag[key].copy()
result_concat.update(formatted_non_rag[key])
result_concat.update(groundedness_results[key])
if key == "results_per_chat":
result_concat.update(formatted_safety)
results[key] = result_concat
return results
return results
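
For orientation, the concatenated output roughly takes the shape below; every key and value here is illustrative, and the actual metric set depends on selected_metrics:

```python
# Illustrative only: not produced verbatim by the commit's code.
results = {
    "results_per_chat": {
        "gpt_coherence": 4.0,        # rounded chat-level mean
        "gpt_groundedness": 5.0,
        "violence": "Very low",      # safety metrics are chat-level only
        "violence_reason": "...",
        "violence_score": 0,
    },
    "results_per_turn": {
        "gpt_coherence": {"score": [4, 4]},
        "gpt_groundedness": {"reason": ["..."], "score": [5, 5]},
    },
}
```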
@@ -1,39 +1,45 @@
from azureml.metrics import constants
from enum import Enum


class RAIService:
"""Define constants related to RAI service"""
TIMEOUT = 1800
TIMEOUT = 120
SLEEPTIME = 2
HARM_SEVERITY_THRESHOLD = 4


class Metric:
"""Defines all metrics supported by RAI service"""
# Content harm
SelfHarm = "self_harm"
Violence = "violence"
Sexual = "sexual"
HateFairness = "hate_fairness"
HateUnfairness = "hate_unfairness"


class Tasks:
"""Defines types of annotation tasks supported by RAI Service."""
CONTENT_HARM = "content harm"


class HarmSeverityLevel(Enum):
Safe = 0
Low = 1
Medium = 2
High = 3
VeryLow = "Very low"
Low = "Low"
Medium = "Medium"
High = "High"


# Content harm metric set
CONTENT_HARM_METRICS = {
Metric.SelfHarm,
Metric.Violence,
Metric.Sexual,
Metric.HateFairness
Metric.HateUnfairness
}


RAG_EVALUATION_SET = constants.Metric.RAG_EVALUATION_SET
NON_RAG_EVALUATION_SET = {constants.Metric.GPTCoherence, constants.Metric.GPTFluency}
NON_RAG_EVALUATION_SET = {constants.Metric.GPTCoherence,
constants.Metric.GPTFluency}
QUALITY_METRIC_SET = RAG_EVALUATION_SET | NON_RAG_EVALUATION_SET
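
The severity enum now carries display labels instead of integers. Below is one plausible binning of a 0-7 harm score into those labels; the real mapping lives in the flow's response parsing and may differ:

```python
from enum import Enum


class HarmSeverityLevel(Enum):
    VeryLow = "Very low"
    Low = "Low"
    Medium = "Medium"
    High = "High"


def to_severity(score: int) -> str:
    # Assumed binning: 0-1, 2-3, 4-5, 6-7 map to the four labels.
    levels = [HarmSeverityLevel.VeryLow, HarmSeverityLevel.Low,
              HarmSeverityLevel.Medium, HarmSeverityLevel.High]
    return levels[min(score // 2, 3)].value


print(to_severity(5))  # Medium
```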
@@ -0,0 +1,42 @@
from promptflow import tool
import json


def normalize_user_text(user_text):
return user_text.replace("'", "\\\"")


def construct_single_request(question: str,
answer: str,
context: dict = None) -> dict:
metrics = ["generic_groundedness"]
user_text = json.dumps({
"question": question,
"answer": answer,
"context": context})
parsed_user_text = normalize_user_text(user_text)
request_body = {"UserTextList": [parsed_user_text],
"AnnotationTask": "groundedness",
"MetricList": metrics}
return request_body


@tool
def construct_groundedness_requests(parsed_chat: dict) -> str:
num_turns = len(parsed_chat["questions"])
request_bodies = []
for i in range(num_turns):
question = parsed_chat["questions"][i]
answer = parsed_chat["answers"][i]
try:
retrieved_documents = eval(
parsed_chat["retrieved_documents"][i])
except Exception:
retrieved_documents = [
parsed_chat["retrieved_documents"][i]]
context = {"citations": retrieved_documents}
request = construct_single_request(question,
answer,
context)
request_bodies.append(request)
return request_bodies
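
Putting the two helpers above together, a single-turn request body looks roughly like this (question, answer, and citation text are made up):

```python
import json


def normalize_user_text(user_text):
    return user_text.replace("'", "\\\"")


question = "What is the capital of France?"
answer = "Paris."
context = {"citations": ["Paris is the capital of France."]}

user_text = json.dumps({"question": question,
                        "answer": answer,
                        "context": context})
request_body = {"UserTextList": [normalize_user_text(user_text)],
                "AnnotationTask": "groundedness",
                "MetricList": ["generic_groundedness"]}
print(request_body["MetricList"])  # ['generic_groundedness']
```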
@@ -1,6 +1,7 @@
from promptflow import tool
import json


def parse_chat(user_text: list):
parsed_chat = []
for turn in user_text:
@@ -17,19 +18,21 @@ def parse_chat(user_text: list):
content_str = json.dumps(turn) + "\n"
parsed_chat.append(content_str)
return "".join(parsed_chat)



def normalize_user_text(user_text):
return user_text.replace("'", "\\\"")

# The inputs section will change based on the arguments of the tool function, after you save the code
# Adding type to arguments and return value will help the system show the types properly
# Please update the function name/signature per need

@tool
def construct_request(user_text: list, selected_metrics: dict) -> dict:
selected_safety_metrics = selected_metrics["safety_metrics"]
metrics = [metric for metric in selected_safety_metrics if selected_safety_metrics[metric]]
metrics = [metric.replace("_unfairness", "_fairness") for metric in
selected_safety_metrics if selected_safety_metrics[metric]]
parsed_user_text = parse_chat(user_text)
request_body = {"UserTextList": [parsed_user_text],
"AnnotationTask": "content harm",
"MetricList": metrics}
request_body = {"UserTextList": [parsed_user_text],
"AnnotationTask": "content harm",
"MetricList": metrics,
"PromptVersion": "0.2"
}
return request_body
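
With the constants rename to hate_unfairness, this node maps the flow-facing key back to the hate_fairness name the service expects; a quick illustration with a made-up selection dict:

```python
# Illustrative selection dict; keys mirror the flow's safety metric names.
selected_safety_metrics = {"violence": True,
                           "sexual": False,
                           "self_harm": True,
                           "hate_unfairness": True}

metrics = [metric.replace("_unfairness", "_fairness")
           for metric in selected_safety_metrics
           if selected_safety_metrics[metric]]
print(metrics)  # ['violence', 'self_harm', 'hate_fairness']
```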