From 4655774b587ceda58cf81c049a32bd768d9e023a Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Thu, 29 Aug 2024 13:17:47 +0800 Subject: [PATCH 1/4] fix ragas to align latest code Signed-off-by: Xinyao Wang --- evals/metrics/ragas/ragas.py | 26 +++++++++++++++++--------- tests/test_ragas.py | 8 ++++---- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index b163b9b8..ca66ddf1 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -32,13 +32,14 @@ def __init__( self.embeddings = embeddings self.metrics = metrics self.validated_list = [ - "answer_relevancy", - "faithfulness", "answer_correctness", + "answer_relevancy", "answer_similarity", "context_precision", - "context_relevancy", "context_recall", + "faithfulness", + "context_utilization", + "reference_free_rubrics_score", ] async def a_measure(self, test_case: Dict): @@ -55,8 +56,9 @@ def measure(self, test_case: Dict): answer_similarity, context_precision, context_recall, - context_relevancy, faithfulness, + context_utilization, + reference_free_rubrics_score, ) except ModuleNotFoundError: @@ -67,8 +69,14 @@ def measure(self, test_case: Dict): except ModuleNotFoundError: raise ModuleNotFoundError("Please install dataset") self.metrics_instance = { + "answer_correctness": answer_correctness, "answer_relevancy": answer_relevancy, + "answer_similarity": answer_similarity, + "context_precision": context_precision, + "context_recall": context_recall, "faithfulness": faithfulness, + "context_utilization": context_utilization, + "reference_free_rubrics_score": reference_free_rubrics_score, } # Set LLM model @@ -101,7 +109,7 @@ def measure(self, test_case: Dict): else: if metric == "answer_relevancy" and self.embeddings is None: raise ValueError("answer_relevancy metric need provide embeddings model.") - tmp_metrics.append(metric) + tmp_metrics.append(self.metrics_instance[metric]) self.metrics = tmp_metrics else: 
self.metrics = [ @@ -115,10 +123,10 @@ def measure(self, test_case: Dict): ] data = { - "question": test_case["input"], - "contexts": test_case["retrieval_context"], - "answer": test_case["actual_output"], - "ground_truth": test_case["expected_output"], + "question": test_case["question"], + "contexts": test_case["contexts"], + "answer": test_case["answer"], + "ground_truth": test_case["ground_truth"], } dataset = Dataset.from_dict(data) diff --git a/tests/test_ragas.py b/tests/test_ragas.py index 7d26067c..e11835ad 100644 --- a/tests/test_ragas.py +++ b/tests/test_ragas.py @@ -26,10 +26,10 @@ def test_ragas(self): embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5") metric = RagasMetric(threshold=0.5, model="http://localhost:8008", embeddings=embeddings) test_case = { - "input": ["What if these shoes don't fit?"], - "actual_output": [actual_output], - "expected_output": [expected_output], - "retrieval_context": [retrieval_context], + "question": ["What if these shoes don't fit?"], + "answer": [actual_output], + "ground_truth": [expected_output], + "contexts": [retrieval_context], } metric.measure(test_case) From af4d08c30ed10b2ec80d77b754f56427b1700d7d Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Thu, 29 Aug 2024 16:34:57 +0800 Subject: [PATCH 2/4] add FaqGen Accuracy scripts Signed-off-by: Xinyao Wang --- examples/FaqGen/README.md | 61 +++++++++++++++++++++++++++++ examples/FaqGen/evaluate.py | 44 +++++++++++++++++++++ examples/FaqGen/generate_FAQ.py | 24 ++++++++++++ examples/FaqGen/get_context.py | 12 ++++++ examples/FaqGen/launch_tgi.sh | 27 +++++++++++++ examples/FaqGen/post_process_FAQ.py | 24 ++++++++++++ 6 files changed, 192 insertions(+) create mode 100644 examples/FaqGen/README.md create mode 100644 examples/FaqGen/evaluate.py create mode 100644 examples/FaqGen/generate_FAQ.py create mode 100644 examples/FaqGen/get_context.py create mode 100644 examples/FaqGen/launch_tgi.sh create mode 100644 examples/FaqGen/post_process_FAQ.py 
diff --git a/examples/FaqGen/README.md b/examples/FaqGen/README.md new file mode 100644 index 00000000..eeb881d2 --- /dev/null +++ b/examples/FaqGen/README.md @@ -0,0 +1,61 @@ +## Dataset +We evaluate performance on the QA dataset [Squad_v2](https://huggingface.co/datasets/rajpurkar/squad_v2). FAQs are generated from the "context" column of the validation dataset, which contains 1204 unique records. + +First download the dataset and put it under "./data" (get_context.py reads "./data/squad_v2/squad_v2/validation-00000-of-00001.parquet"). + +Extract the unique "context" values, which will be saved to 'data/sqv2_context.json': +``` +python get_context.py +``` + +## Generate FAQs + +### Launch FaQGen microservice +Please refer to [FaQGen microservice](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/faq-generation/tgi) and set up a microservice endpoint. +``` +export FAQ_ENDPOINT="http://${your_ip}:9000/v1/faqgen" +``` + +### Generate FAQs with microservice +Use the microservice endpoint to generate FAQs for the dataset. +``` +python generate_FAQ.py +``` + +Post-process the output to get the right data, which will be saved to 'data/sqv2_faq.json'. +``` +python post_process_FAQ.py +``` + +## Evaluate with Ragas + +### Launch TGI service +We use "mistralai/Mixtral-8x7B-Instruct-v0.1" as the LLM referee to evaluate the model. First we need to launch an LLM endpoint on Gaudi. 
+``` +export HUGGING_FACE_HUB_TOKEN="your_huggingface_token" +bash launch_tgi.sh +``` +Get the endpoint: +``` +export LLM_ENDPOINT="http://${ip_address}:8082" +``` + +Verify the service: +```bash +curl http://${ip_address}:8082/generate \ + -X POST \ + -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \ + -H 'Content-Type: application/json' +``` + +### Evaluate +Evaluate the performance with the LLM: +``` +python evaluate.py +``` + +### Performance Result +Here are the test results for your reference: +| answer_relevancy | faithfulness | context_utilization | reference_free_rubrics_score | +| ---- | ---- |---- |---- | +| 0.7191 | 0.9681 | 0.8964 | 4.4125| diff --git a/examples/FaqGen/evaluate.py b/examples/FaqGen/evaluate.py new file mode 100644 index 00000000..63f12349 --- /dev/null +++ b/examples/FaqGen/evaluate.py @@ -0,0 +1,44 @@ +from evals.metrics.ragas import RagasMetric +from langchain_community.embeddings import HuggingFaceBgeEmbeddings +import json, os + +llm_endpoint = os.getenv("LLM_ENDPOINT", "http://0.0.0.0:8082") + +f = open("data/sqv2_context.json","r") +sqv2_context = json.load(f) + +f = open("data/sqv2_faq.json","r") +sqv2_faq = json.load(f) + +templ = """Create a concise FAQs (frequently asked questions and answers) for following text: + TEXT: {text} + Do not use any prefix or suffix to the FAQ. 
+ """ + +number = 1204 +question = [] +answer = [] +ground_truth = ["None"]*number +contexts = [] +for i in range(number): + inputs = sqv2_context[str(i)] + inputs_faq = templ.format_map({"text":inputs}) + actual_output = sqv2_faq[str(i)] + + question.append(inputs_faq) + answer.append(actual_output) + contexts.append([inputs_faq]) + +embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5") +metrics_faq = ["answer_relevancy","faithfulness", "context_utilization","reference_free_rubrics_score"] +metric = RagasMetric(threshold=0.5, model=llm_endpoint, embeddings=embeddings, metrics=metrics_faq) + +test_case = { + "question": question, + "answer": answer, + "ground_truth": ground_truth, + "contexts": contexts +} + +metric.measure(test_case) +print(metric.score) diff --git a/examples/FaqGen/generate_FAQ.py b/examples/FaqGen/generate_FAQ.py new file mode 100644 index 00000000..11f83db7 --- /dev/null +++ b/examples/FaqGen/generate_FAQ.py @@ -0,0 +1,24 @@ +import json +import requests +import time, os + +llm_endpoint = os.getenv("FAQ_ENDPOINT", "http://0.0.0.0:9000/v1/faqgen") + +f = open("data/sqv2_context.json","r") +sqv2_context = json.load(f) + +start_time = time.time() +headers = {"Content-Type": "application/json"} +for i in range(1204): + start_time_tmp = time.time() + print(i) + inputs = sqv2_context[str(i)] + data = {"query": inputs, "max_new_tokens": 128} + response = requests.post(llm_endpoint, json=data, headers=headers) + f = open(f"data/result/sqv2_faq_{i}", "w") + f.write(inputs) + f.write(str(response.content, encoding='utf-8')) + f.close() + print(f"Cost {time.time()-start_time_tmp} seconds") +print(f"\n Finished! 
\n Totally Cost {time.time()-start_time} seconds\n") + diff --git a/examples/FaqGen/get_context.py b/examples/FaqGen/get_context.py new file mode 100644 index 00000000..5524cb1a --- /dev/null +++ b/examples/FaqGen/get_context.py @@ -0,0 +1,12 @@ +import pandas as pd +import json,os + +data_path = "./data" +data = pd.read_parquet(os.path.join(data_path, "squad_v2/squad_v2/validation-00000-of-00001.parquet")) +sq_context = list(data["context"].unique()) +sq_context_d = dict() +for i in range(len(sq_context)): + sq_context_d[i] = sq_context[i] + +with open(os.path.join(data_path, "sqv2_context.json"), "w") as outfile: + json.dump(sq_context_d, outfile) diff --git a/examples/FaqGen/launch_tgi.sh b/examples/FaqGen/launch_tgi.sh new file mode 100644 index 00000000..2664b872 --- /dev/null +++ b/examples/FaqGen/launch_tgi.sh @@ -0,0 +1,27 @@ +max_input_tokens=3072 +max_total_tokens=4096 +port_number=8082 +model_name="mistralai/Mixtral-8x7B-Instruct-v0.1" +volume="./data" +docker run -it --rm \ + --name="tgi_Mixtral" \ + -p $port_number:80 \ + -v $volume:/data \ + --runtime=habana \ + --restart always \ + -e HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \ + -e HABANA_VISIBLE_DEVICES=all \ + -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ + -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \ + --cap-add=sys_nice \ + --ipc=host \ + -e HTTPS_PROXY=$https_proxy \ + -e HTTP_PROXY=$https_proxy \ + ghcr.io/huggingface/tgi-gaudi:2.0.1 \ + --model-id $model_name \ + --max-input-tokens $max_input_tokens \ + --max-total-tokens $max_total_tokens \ + --sharded true \ + --num-shard 2 + + diff --git a/examples/FaqGen/post_process_FAQ.py b/examples/FaqGen/post_process_FAQ.py new file mode 100644 index 00000000..d122c909 --- /dev/null +++ b/examples/FaqGen/post_process_FAQ.py @@ -0,0 +1,24 @@ +import json + +faq_dict = {} +fails = [] +for i in range(1204): + data = open(f"data/result/sqv2_faq_{i}","r").readlines() + result = data[-6][6:] + # print(result) + if "LLMChain/final_output" not in 
result: + print(f"error1: fail for {i}") + fails.append(i) + continue + try: + result2 = json.loads(result) + result3 = result2["ops"][0]["value"]["text"] + faq_dict[str(i)] = result3 + except: + print(f"error2: fail for {i}") + fails.append(i) + continue +with open("data/sqv2_faq.json", "w") as outfile: + json.dump(faq_dict, outfile) +print("Failure index:") +print(fails) \ No newline at end of file From 66ddd5c4000683dfc8ab3852b00ce3c3e6dd2ee9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 29 Aug 2024 09:17:39 +0000 Subject: [PATCH 3/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- evals/metrics/ragas/ragas.py | 2 +- examples/FaqGen/evaluate.py | 27 ++++++++++++++------------- examples/FaqGen/generate_FAQ.py | 12 ++++++++---- examples/FaqGen/get_context.py | 11 ++++++++--- examples/FaqGen/launch_tgi.sh | 5 +++-- examples/FaqGen/post_process_FAQ.py | 7 +++++-- 6 files changed, 39 insertions(+), 25 deletions(-) diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index ca66ddf1..4c9a45e1 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -56,8 +56,8 @@ def measure(self, test_case: Dict): answer_similarity, context_precision, context_recall, - faithfulness, context_utilization, + faithfulness, reference_free_rubrics_score, ) diff --git a/examples/FaqGen/evaluate.py b/examples/FaqGen/evaluate.py index 63f12349..a082d093 100644 --- a/examples/FaqGen/evaluate.py +++ b/examples/FaqGen/evaluate.py @@ -1,13 +1,19 @@ -from evals.metrics.ragas import RagasMetric +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import json +import os + from langchain_community.embeddings import HuggingFaceBgeEmbeddings -import json, os + +from evals.metrics.ragas import RagasMetric llm_endpoint = os.getenv("LLM_ENDPOINT", "http://0.0.0.0:8082") -f = open("data/sqv2_context.json","r") 
+f = open("data/sqv2_context.json", "r") sqv2_context = json.load(f) -f = open("data/sqv2_faq.json","r") +f = open("data/sqv2_faq.json", "r") sqv2_faq = json.load(f) templ = """Create a concise FAQs (frequently asked questions and answers) for following text: @@ -18,11 +24,11 @@ number = 1204 question = [] answer = [] -ground_truth = ["None"]*number +ground_truth = ["None"] * number contexts = [] for i in range(number): inputs = sqv2_context[str(i)] - inputs_faq = templ.format_map({"text":inputs}) + inputs_faq = templ.format_map({"text": inputs}) actual_output = sqv2_faq[str(i)] question.append(inputs_faq) @@ -30,15 +36,10 @@ contexts.append([inputs_faq]) embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5") -metrics_faq = ["answer_relevancy","faithfulness", "context_utilization","reference_free_rubrics_score"] +metrics_faq = ["answer_relevancy", "faithfulness", "context_utilization", "reference_free_rubrics_score"] metric = RagasMetric(threshold=0.5, model=llm_endpoint, embeddings=embeddings, metrics=metrics_faq) -test_case = { - "question": question, - "answer": answer, - "ground_truth": ground_truth, - "contexts": contexts -} +test_case = {"question": question, "answer": answer, "ground_truth": ground_truth, "contexts": contexts} metric.measure(test_case) print(metric.score) diff --git a/examples/FaqGen/generate_FAQ.py b/examples/FaqGen/generate_FAQ.py index 11f83db7..2ed70b9e 100644 --- a/examples/FaqGen/generate_FAQ.py +++ b/examples/FaqGen/generate_FAQ.py @@ -1,10 +1,15 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import json +import os +import time + import requests -import time, os llm_endpoint = os.getenv("FAQ_ENDPOINT", "http://0.0.0.0:9000/v1/faqgen") -f = open("data/sqv2_context.json","r") +f = open("data/sqv2_context.json", "r") sqv2_context = json.load(f) start_time = time.time() @@ -17,8 +22,7 @@ response = requests.post(llm_endpoint, json=data, headers=headers) f = 
open(f"data/result/sqv2_faq_{i}", "w") f.write(inputs) - f.write(str(response.content, encoding='utf-8')) + f.write(str(response.content, encoding="utf-8")) f.close() print(f"Cost {time.time()-start_time_tmp} seconds") print(f"\n Finished! \n Totally Cost {time.time()-start_time} seconds\n") - diff --git a/examples/FaqGen/get_context.py b/examples/FaqGen/get_context.py index 5524cb1a..8cb73a05 100644 --- a/examples/FaqGen/get_context.py +++ b/examples/FaqGen/get_context.py @@ -1,5 +1,10 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import json +import os + import pandas as pd -import json,os data_path = "./data" data = pd.read_parquet(os.path.join(data_path, "squad_v2/squad_v2/validation-00000-of-00001.parquet")) @@ -7,6 +12,6 @@ sq_context_d = dict() for i in range(len(sq_context)): sq_context_d[i] = sq_context[i] - -with open(os.path.join(data_path, "sqv2_context.json"), "w") as outfile: + +with open(os.path.join(data_path, "sqv2_context.json"), "w") as outfile: json.dump(sq_context_d, outfile) diff --git a/examples/FaqGen/launch_tgi.sh b/examples/FaqGen/launch_tgi.sh index 2664b872..b3e04bbb 100644 --- a/examples/FaqGen/launch_tgi.sh +++ b/examples/FaqGen/launch_tgi.sh @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + max_input_tokens=3072 max_total_tokens=4096 port_number=8082 @@ -23,5 +26,3 @@ docker run -it --rm \ --max-total-tokens $max_total_tokens \ --sharded true \ --num-shard 2 - - diff --git a/examples/FaqGen/post_process_FAQ.py b/examples/FaqGen/post_process_FAQ.py index d122c909..83e6b835 100644 --- a/examples/FaqGen/post_process_FAQ.py +++ b/examples/FaqGen/post_process_FAQ.py @@ -1,9 +1,12 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import json faq_dict = {} fails = [] for i in range(1204): - data = open(f"data/result/sqv2_faq_{i}","r").readlines() + data = open(f"data/result/sqv2_faq_{i}", "r").readlines() result = 
data[-6][6:] # print(result) if "LLMChain/final_output" not in result: @@ -21,4 +24,4 @@ with open("data/sqv2_faq.json", "w") as outfile: json.dump(faq_dict, outfile) print("Failure index:") -print(fails) \ No newline at end of file +print(fails) From da250e59b9bc87ea921f9c8cb61eaf669d710acf Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Mon, 2 Sep 2024 09:14:26 +0800 Subject: [PATCH 4/4] fix bug Signed-off-by: Xinyao Wang --- evals/metrics/ragas/ragas.py | 1 - 1 file changed, 1 deletion(-) diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index 4c9a45e1..9525ce07 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -118,7 +118,6 @@ def measure(self, test_case: Dict): answer_correctness, answer_similarity, context_precision, - context_relevancy, context_recall, ]