From 4655774b587ceda58cf81c049a32bd768d9e023a Mon Sep 17 00:00:00 2001
From: Xinyao Wang <xinyao.wang@intel.com>
Date: Thu, 29 Aug 2024 13:17:47 +0800
Subject: [PATCH 1/4] fix ragas to align latest code

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
---
 evals/metrics/ragas/ragas.py | 26 +++++++++++++++++---------
 tests/test_ragas.py          |  8 ++++----
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py
index b163b9b8..ca66ddf1 100644
--- a/evals/metrics/ragas/ragas.py
+++ b/evals/metrics/ragas/ragas.py
@@ -32,13 +32,14 @@ def __init__(
         self.embeddings = embeddings
         self.metrics = metrics
         self.validated_list = [
-            "answer_relevancy",
-            "faithfulness",
             "answer_correctness",
+            "answer_relevancy",
             "answer_similarity",
             "context_precision",
-            "context_relevancy",
             "context_recall",
+            "faithfulness",
+            "context_utilization",
+            "reference_free_rubrics_score",
         ]
 
     async def a_measure(self, test_case: Dict):
@@ -55,8 +56,9 @@ def measure(self, test_case: Dict):
                 answer_similarity,
                 context_precision,
                 context_recall,
-                context_relevancy,
                 faithfulness,
+                context_utilization,
+                reference_free_rubrics_score,
             )
 
         except ModuleNotFoundError:
@@ -67,8 +69,14 @@ def measure(self, test_case: Dict):
         except ModuleNotFoundError:
             raise ModuleNotFoundError("Please install dataset")
         self.metrics_instance = {
+            "answer_correctness": answer_correctness,
             "answer_relevancy": answer_relevancy,
+            "answer_similarity": answer_similarity,
+            "context_precision": context_precision,
+            "context_recall": context_recall,
             "faithfulness": faithfulness,
+            "context_utilization": context_utilization,
+            "reference_free_rubrics_score": reference_free_rubrics_score,
         }
 
         # Set LLM model
@@ -101,7 +109,7 @@ def measure(self, test_case: Dict):
                 else:
                     if metric == "answer_relevancy" and self.embeddings is None:
                         raise ValueError("answer_relevancy metric need provide embeddings model.")
-                    tmp_metrics.append(metric)
+                    tmp_metrics.append(self.metrics_instance[metric])
             self.metrics = tmp_metrics
         else:
             self.metrics = [
@@ -115,10 +123,10 @@ def measure(self, test_case: Dict):
             ]
 
         data = {
-            "question": test_case["input"],
-            "contexts": test_case["retrieval_context"],
-            "answer": test_case["actual_output"],
-            "ground_truth": test_case["expected_output"],
+            "question": test_case["question"],
+            "contexts": test_case["contexts"],
+            "answer": test_case["answer"],
+            "ground_truth": test_case["ground_truth"],
         }
         dataset = Dataset.from_dict(data)
 
diff --git a/tests/test_ragas.py b/tests/test_ragas.py
index 7d26067c..e11835ad 100644
--- a/tests/test_ragas.py
+++ b/tests/test_ragas.py
@@ -26,10 +26,10 @@ def test_ragas(self):
         embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5")
         metric = RagasMetric(threshold=0.5, model="http://localhost:8008", embeddings=embeddings)
         test_case = {
-            "input": ["What if these shoes don't fit?"],
-            "actual_output": [actual_output],
-            "expected_output": [expected_output],
-            "retrieval_context": [retrieval_context],
+            "question": ["What if these shoes don't fit?"],
+            "answer": [actual_output],
+            "ground_truth": [expected_output],
+            "contexts": [retrieval_context],
         }
 
         metric.measure(test_case)

From af4d08c30ed10b2ec80d77b754f56427b1700d7d Mon Sep 17 00:00:00 2001
From: Xinyao Wang <xinyao.wang@intel.com>
Date: Thu, 29 Aug 2024 16:34:57 +0800
Subject: [PATCH 2/4] add FaqGen Accuracy scripts

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
---
 examples/FaqGen/README.md           | 61 +++++++++++++++++++++++++++++
 examples/FaqGen/evaluate.py         | 44 +++++++++++++++++++++
 examples/FaqGen/generate_FAQ.py     | 24 ++++++++++++
 examples/FaqGen/get_context.py      | 12 ++++++
 examples/FaqGen/launch_tgi.sh       | 27 +++++++++++++
 examples/FaqGen/post_process_FAQ.py | 24 ++++++++++++
 6 files changed, 192 insertions(+)
 create mode 100644 examples/FaqGen/README.md
 create mode 100644 examples/FaqGen/evaluate.py
 create mode 100644 examples/FaqGen/generate_FAQ.py
 create mode 100644 examples/FaqGen/get_context.py
 create mode 100644 examples/FaqGen/launch_tgi.sh
 create mode 100644 examples/FaqGen/post_process_FAQ.py

diff --git a/examples/FaqGen/README.md b/examples/FaqGen/README.md
new file mode 100644
index 00000000..eeb881d2
--- /dev/null
+++ b/examples/FaqGen/README.md
@@ -0,0 +1,61 @@
+## Dataset 
+We evaluate performance on the QA dataset [Squad_v2](https://huggingface.co/datasets/rajpurkar/squad_v2). FAQs are generated from the "context" column of the validation split, which contains 1204 unique records.
+
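+First, download the dataset and put it under "./data"; `get_context.py` reads the validation split from "./data/squad_v2/squad_v2/validation-00000-of-00001.parquet".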
+
+Extract the unique "context" values, which will be saved to 'data/sqv2_context.json':
+```
+python get_context.py
+```
+
+## Generate FAQs
+
+### Launch FaQGen microservice
+Please refer to the [FaQGen microservice](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/faq-generation/tgi) to set up a microservice endpoint.
+```
+export FAQ_ENDPOINT="http://${your_ip}:9000/v1/faqgen"
+```
+
+### Generate FAQs with microservice
+Use the microservice endpoint to generate FAQs for the dataset. The raw responses are written to 'data/result/', so create that directory first.
+```
+python generate_FAQ.py
+```
+
+Post-process the output to extract the generated FAQs, which will be saved to 'data/sqv2_faq.json'.
+```
+python post_process_FAQ.py
+```
+
+## Evaluate with Ragas
+
+### Launch TGI service
+We use "mistralai/Mixtral-8x7B-Instruct-v0.1" as the LLM referee to evaluate the model. First, we need to launch an LLM endpoint on Gaudi.
+```
+export HUGGING_FACE_HUB_TOKEN="your_huggingface_token"
+bash launch_tgi.sh
+```
+Get the endpoint:
+```
+export LLM_ENDPOINT="http://${ip_address}:8082"
+```
+
+Verify the service:
+```bash
+curl http://${ip_address}:8082/generate \
+    -X POST \
+    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
+    -H 'Content-Type: application/json'
+```
+
+### Evaluate
+Evaluate the performance with the LLM:
+```
+python evaluate.py
+```
+
+### Performance Result
+Here are the tested results for your reference:
+|  answer_relevancy   | faithfulness  | context_utilization | reference_free_rubrics_score |
+|  ----  | ----  |----  |----  |
+| 0.7191	| 0.9681	| 0.8964 |	4.4125|
diff --git a/examples/FaqGen/evaluate.py b/examples/FaqGen/evaluate.py
new file mode 100644
index 00000000..63f12349
--- /dev/null
+++ b/examples/FaqGen/evaluate.py
@@ -0,0 +1,44 @@
+from evals.metrics.ragas import RagasMetric
+from langchain_community.embeddings import HuggingFaceBgeEmbeddings
+import json, os
+
+llm_endpoint = os.getenv("LLM_ENDPOINT", "http://0.0.0.0:8082")
+
+f = open("data/sqv2_context.json","r")
+sqv2_context = json.load(f)
+
+f = open("data/sqv2_faq.json","r")
+sqv2_faq = json.load(f)
+
+templ = """Create a concise FAQs (frequently asked questions and answers) for following text:
+        TEXT: {text}
+        Do not use any prefix or suffix to the FAQ.
+    """
+
+number = 1204
+question = []
+answer = []
+ground_truth = ["None"]*number
+contexts = []
+for i in range(number):
+    inputs = sqv2_context[str(i)]
+    inputs_faq = templ.format_map({"text":inputs})
+    actual_output = sqv2_faq[str(i)]
+
+    question.append(inputs_faq)
+    answer.append(actual_output)
+    contexts.append([inputs_faq])
+
+embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5")
+metrics_faq = ["answer_relevancy","faithfulness", "context_utilization","reference_free_rubrics_score"]
+metric = RagasMetric(threshold=0.5, model=llm_endpoint, embeddings=embeddings, metrics=metrics_faq)
+
+test_case = {
+    "question": question,
+    "answer": answer,
+    "ground_truth": ground_truth,
+    "contexts": contexts
+}
+
+metric.measure(test_case)
+print(metric.score)
diff --git a/examples/FaqGen/generate_FAQ.py b/examples/FaqGen/generate_FAQ.py
new file mode 100644
index 00000000..11f83db7
--- /dev/null
+++ b/examples/FaqGen/generate_FAQ.py
@@ -0,0 +1,24 @@
+import json
+import requests
+import time, os
+
+llm_endpoint = os.getenv("FAQ_ENDPOINT", "http://0.0.0.0:9000/v1/faqgen")
+
+f = open("data/sqv2_context.json","r")
+sqv2_context = json.load(f)
+
+start_time = time.time()
+headers = {"Content-Type": "application/json"}
+for i in range(1204):
+    start_time_tmp = time.time()
+    print(i)
+    inputs = sqv2_context[str(i)]
+    data = {"query": inputs, "max_new_tokens": 128}
+    response = requests.post(llm_endpoint, json=data, headers=headers)
+    f = open(f"data/result/sqv2_faq_{i}", "w")
+    f.write(inputs)
+    f.write(str(response.content, encoding='utf-8'))
+    f.close()
+    print(f"Cost {time.time()-start_time_tmp} seconds")
+print(f"\n Finished! \n Totally Cost {time.time()-start_time} seconds\n")
+
diff --git a/examples/FaqGen/get_context.py b/examples/FaqGen/get_context.py
new file mode 100644
index 00000000..5524cb1a
--- /dev/null
+++ b/examples/FaqGen/get_context.py
@@ -0,0 +1,12 @@
+import pandas as pd
+import json,os
+
+data_path = "./data"
+data = pd.read_parquet(os.path.join(data_path, "squad_v2/squad_v2/validation-00000-of-00001.parquet"))
+sq_context = list(data["context"].unique())
+sq_context_d = dict()
+for i in range(len(sq_context)):
+    sq_context_d[i] = sq_context[i]
+  
+with open(os.path.join(data_path, "sqv2_context.json"), "w") as outfile: 
+    json.dump(sq_context_d, outfile)
diff --git a/examples/FaqGen/launch_tgi.sh b/examples/FaqGen/launch_tgi.sh
new file mode 100644
index 00000000..2664b872
--- /dev/null
+++ b/examples/FaqGen/launch_tgi.sh
@@ -0,0 +1,27 @@
+max_input_tokens=3072
+max_total_tokens=4096
+port_number=8082
+model_name="mistralai/Mixtral-8x7B-Instruct-v0.1"
+volume="./data"
+docker run -it --rm \
+    --name="tgi_Mixtral" \
+    -p $port_number:80 \
+    -v $volume:/data \
+    --runtime=habana \
+    --restart always \
+    -e HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \
+    -e HABANA_VISIBLE_DEVICES=all \
+    -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
+    -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
+    --cap-add=sys_nice \
+    --ipc=host \
+    -e HTTPS_PROXY=$https_proxy \
+    -e HTTP_PROXY=$https_proxy \
+    ghcr.io/huggingface/tgi-gaudi:2.0.1 \
+    --model-id $model_name \
+    --max-input-tokens $max_input_tokens \
+    --max-total-tokens $max_total_tokens \
+    --sharded true \
+    --num-shard 2
+
+
diff --git a/examples/FaqGen/post_process_FAQ.py b/examples/FaqGen/post_process_FAQ.py
new file mode 100644
index 00000000..d122c909
--- /dev/null
+++ b/examples/FaqGen/post_process_FAQ.py
@@ -0,0 +1,24 @@
+import json
+
+faq_dict = {}
+fails = []
+for i in range(1204):
+    data = open(f"data/result/sqv2_faq_{i}","r").readlines()
+    result = data[-6][6:]
+    # print(result)
+    if "LLMChain/final_output" not in result:
+        print(f"error1: fail for {i}")
+        fails.append(i)
+        continue
+    try:
+        result2 = json.loads(result)
+        result3 = result2["ops"][0]["value"]["text"]
+        faq_dict[str(i)] = result3
+    except:
+        print(f"error2: fail for {i}")
+        fails.append(i)
+        continue
+with open("data/sqv2_faq.json", "w") as outfile:
+    json.dump(faq_dict, outfile)
+print("Failure index:")
+print(fails)
\ No newline at end of file

From 66ddd5c4000683dfc8ab3852b00ce3c3e6dd2ee9 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 29 Aug 2024 09:17:39 +0000
Subject: [PATCH 3/4] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 evals/metrics/ragas/ragas.py        |  2 +-
 examples/FaqGen/evaluate.py         | 27 ++++++++++++++-------------
 examples/FaqGen/generate_FAQ.py     | 12 ++++++++----
 examples/FaqGen/get_context.py      | 11 ++++++++---
 examples/FaqGen/launch_tgi.sh       |  5 +++--
 examples/FaqGen/post_process_FAQ.py |  7 +++++--
 6 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py
index ca66ddf1..4c9a45e1 100644
--- a/evals/metrics/ragas/ragas.py
+++ b/evals/metrics/ragas/ragas.py
@@ -56,8 +56,8 @@ def measure(self, test_case: Dict):
                 answer_similarity,
                 context_precision,
                 context_recall,
-                faithfulness,
                 context_utilization,
+                faithfulness,
                 reference_free_rubrics_score,
             )
 
diff --git a/examples/FaqGen/evaluate.py b/examples/FaqGen/evaluate.py
index 63f12349..a082d093 100644
--- a/examples/FaqGen/evaluate.py
+++ b/examples/FaqGen/evaluate.py
@@ -1,13 +1,19 @@
-from evals.metrics.ragas import RagasMetric
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import os
+
 from langchain_community.embeddings import HuggingFaceBgeEmbeddings
-import json, os
+
+from evals.metrics.ragas import RagasMetric
 
 llm_endpoint = os.getenv("LLM_ENDPOINT", "http://0.0.0.0:8082")
 
-f = open("data/sqv2_context.json","r")
+f = open("data/sqv2_context.json", "r")
 sqv2_context = json.load(f)
 
-f = open("data/sqv2_faq.json","r")
+f = open("data/sqv2_faq.json", "r")
 sqv2_faq = json.load(f)
 
 templ = """Create a concise FAQs (frequently asked questions and answers) for following text:
@@ -18,11 +24,11 @@
 number = 1204
 question = []
 answer = []
-ground_truth = ["None"]*number
+ground_truth = ["None"] * number
 contexts = []
 for i in range(number):
     inputs = sqv2_context[str(i)]
-    inputs_faq = templ.format_map({"text":inputs})
+    inputs_faq = templ.format_map({"text": inputs})
     actual_output = sqv2_faq[str(i)]
 
     question.append(inputs_faq)
@@ -30,15 +36,10 @@
     contexts.append([inputs_faq])
 
 embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5")
-metrics_faq = ["answer_relevancy","faithfulness", "context_utilization","reference_free_rubrics_score"]
+metrics_faq = ["answer_relevancy", "faithfulness", "context_utilization", "reference_free_rubrics_score"]
 metric = RagasMetric(threshold=0.5, model=llm_endpoint, embeddings=embeddings, metrics=metrics_faq)
 
-test_case = {
-    "question": question,
-    "answer": answer,
-    "ground_truth": ground_truth,
-    "contexts": contexts
-}
+test_case = {"question": question, "answer": answer, "ground_truth": ground_truth, "contexts": contexts}
 
 metric.measure(test_case)
 print(metric.score)
diff --git a/examples/FaqGen/generate_FAQ.py b/examples/FaqGen/generate_FAQ.py
index 11f83db7..2ed70b9e 100644
--- a/examples/FaqGen/generate_FAQ.py
+++ b/examples/FaqGen/generate_FAQ.py
@@ -1,10 +1,15 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 import json
+import os
+import time
+
 import requests
-import time, os
 
 llm_endpoint = os.getenv("FAQ_ENDPOINT", "http://0.0.0.0:9000/v1/faqgen")
 
-f = open("data/sqv2_context.json","r")
+f = open("data/sqv2_context.json", "r")
 sqv2_context = json.load(f)
 
 start_time = time.time()
@@ -17,8 +22,7 @@
     response = requests.post(llm_endpoint, json=data, headers=headers)
     f = open(f"data/result/sqv2_faq_{i}", "w")
     f.write(inputs)
-    f.write(str(response.content, encoding='utf-8'))
+    f.write(str(response.content, encoding="utf-8"))
     f.close()
     print(f"Cost {time.time()-start_time_tmp} seconds")
 print(f"\n Finished! \n Totally Cost {time.time()-start_time} seconds\n")
-
diff --git a/examples/FaqGen/get_context.py b/examples/FaqGen/get_context.py
index 5524cb1a..8cb73a05 100644
--- a/examples/FaqGen/get_context.py
+++ b/examples/FaqGen/get_context.py
@@ -1,5 +1,10 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import os
+
 import pandas as pd
-import json,os
 
 data_path = "./data"
 data = pd.read_parquet(os.path.join(data_path, "squad_v2/squad_v2/validation-00000-of-00001.parquet"))
@@ -7,6 +12,6 @@
 sq_context_d = dict()
 for i in range(len(sq_context)):
     sq_context_d[i] = sq_context[i]
-  
-with open(os.path.join(data_path, "sqv2_context.json"), "w") as outfile: 
+
+with open(os.path.join(data_path, "sqv2_context.json"), "w") as outfile:
     json.dump(sq_context_d, outfile)
diff --git a/examples/FaqGen/launch_tgi.sh b/examples/FaqGen/launch_tgi.sh
index 2664b872..b3e04bbb 100644
--- a/examples/FaqGen/launch_tgi.sh
+++ b/examples/FaqGen/launch_tgi.sh
@@ -1,3 +1,6 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 max_input_tokens=3072
 max_total_tokens=4096
 port_number=8082
@@ -23,5 +26,3 @@ docker run -it --rm \
     --max-total-tokens $max_total_tokens \
     --sharded true \
     --num-shard 2
-
-
diff --git a/examples/FaqGen/post_process_FAQ.py b/examples/FaqGen/post_process_FAQ.py
index d122c909..83e6b835 100644
--- a/examples/FaqGen/post_process_FAQ.py
+++ b/examples/FaqGen/post_process_FAQ.py
@@ -1,9 +1,12 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 
 faq_dict = {}
 fails = []
 for i in range(1204):
-    data = open(f"data/result/sqv2_faq_{i}","r").readlines()
+    data = open(f"data/result/sqv2_faq_{i}", "r").readlines()
     result = data[-6][6:]
     # print(result)
     if "LLMChain/final_output" not in result:
@@ -21,4 +24,4 @@
 with open("data/sqv2_faq.json", "w") as outfile:
     json.dump(faq_dict, outfile)
 print("Failure index:")
-print(fails)
\ No newline at end of file
+print(fails)

From da250e59b9bc87ea921f9c8cb61eaf669d710acf Mon Sep 17 00:00:00 2001
From: Xinyao Wang <xinyao.wang@intel.com>
Date: Mon, 2 Sep 2024 09:14:26 +0800
Subject: [PATCH 4/4] fix bug

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
---
 evals/metrics/ragas/ragas.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py
index 4c9a45e1..9525ce07 100644
--- a/evals/metrics/ragas/ragas.py
+++ b/evals/metrics/ragas/ragas.py
@@ -118,7 +118,6 @@ def measure(self, test_case: Dict):
                 answer_correctness,
                 answer_similarity,
                 context_precision,
-                context_relevancy,
                 context_recall,
             ]