From be548039cb0d54903aca5477d3124c02ceca3d5d Mon Sep 17 00:00:00 2001
From: minmin-intel
Date: Thu, 22 Aug 2024 16:29:01 -0700
Subject: [PATCH 01/18] add crag eval first pass code

---
 evals/evaluation/crag_eval/README.md | 79 +++++++++++++
 .../crag_eval/preprocess_data/process_data.py | 111 ++++++++++++++++++
 .../preprocess_data/run_data_preprocess.sh | 5 +
 .../preprocess_data/run_sample_data.sh | 3 +
 .../crag_eval/preprocess_data/sample_data.py | 26 ++++
 evals/evaluation/crag_eval/requirements.txt | 1 +
 .../run_benchmark/generate_answers.py | 71 +++++++++++
 .../crag_eval/run_benchmark/grade_answers.py | 8 ++
 .../run_benchmark/run_generate_answer.sh | 13 ++
 9 files changed, 317 insertions(+)
 create mode 100644 evals/evaluation/crag_eval/README.md
 create mode 100644 evals/evaluation/crag_eval/preprocess_data/process_data.py
 create mode 100644 evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh
 create mode 100644 evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh
 create mode 100644 evals/evaluation/crag_eval/preprocess_data/sample_data.py
 create mode 100644 evals/evaluation/crag_eval/requirements.txt
 create mode 100644 evals/evaluation/crag_eval/run_benchmark/generate_answers.py
 create mode 100644 evals/evaluation/crag_eval/run_benchmark/grade_answers.py
 create mode 100644 evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh

diff --git a/evals/evaluation/crag_eval/README.md b/evals/evaluation/crag_eval/README.md
new file mode 100644
index 00000000..5f68b24c
--- /dev/null
+++ b/evals/evaluation/crag_eval/README.md
@@ -0,0 +1,79 @@
+# CRAG Benchmark for Agent QnA systems
+## Overview
+The [Comprehensive RAG (CRAG) benchmark](https://www.aicrowd.com/challenges/meta-comprehensive-rag-benchmark-kdd-cup-2024) was introduced by Meta in 2024 as a KDD Cup challenge. The CRAG benchmark has questions across five domains and eight question types, and provides a practical setup for evaluating RAG systems. In particular, CRAG includes questions whose answers change over time spans ranging from seconds to years; it accounts for entity popularity, covering not only head facts but also torso and tail facts; and it contains simple-fact questions as well as seven types of complex questions, such as comparison, aggregation, and set questions, to test the reasoning and synthesis capabilities of RAG solutions. Additionally, CRAG provides mock APIs for querying mock knowledge graphs, so developers can also benchmark the API-calling capabilities of agents. Moreover, golden answers are provided in the dataset, which makes auto-evaluation with LLMs more robust. Therefore, the CRAG benchmark is a realistic and comprehensive benchmark for agents.
+
+## Getting started
+1. Set up a work directory and download this repo into your work directory.
+```
+export WORKDIR=
+cd $WORKDIR
+git clone https://github.com/opea-project/GenAIEval.git
+```
+2. Create conda environment and install packages
+```
+conda create -n crag-benchmark-env python=3.11
+conda activate crag-benchmark-env
+cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/
+pip install -r requirements.txt
+```
+## CRAG dataset
+1. Download original data and process it with commands below.
+You need to create an account on the Meta CRAG challenge [website](https://www.aicrowd.com/challenges/meta-comprehensive-rag-benchmark-kdd-cup-2024).
+After login, go to this [link](https://www.aicrowd.com/challenges/meta-comprehensive-rag-benchmark-kdd-cup-2024/problems/meta-kdd-cup-24-crag-end-to-end-retrieval-augmented-generation/dataset_files) and download the `crag_task_3_dev_v4.tar.bz2` file. Then make a `datasets` directory in your work directory using the commands below.
+```
+cd $WORKDIR
+mkdir datasets
+```
+Then put the `crag_task_3_dev_v4.tar.bz2` file in the `datasets` directory, and decompress it by running the command below.
+```
+cd $WORKDIR/datasets
+tar -xf crag_task_3_dev_v4.tar.bz2
+```
+2. Preprocess the CRAG data
+Data preprocessing directly affects the quality of the retrieval corpus and thus can have a significant impact on the agent QnA system. Here, we provide one way of preprocessing the data, where we simply extract all the web search snippets as-is from the dataset, per domain. We also extract all the query-answer pairs along with other metadata, per domain. You can run the command below to use our method. The data processing will take some time to finish.
+```
+cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/preprocess_data
+bash run_data_preprocess.sh
+```
+3. Optional - Sample queries for benchmark
+The CRAG dataset has more than 4000 queries, and running all of them can be very expensive and time-consuming. You can sample a subset for the benchmark. Here we provide a script to sample up to 5 queries per question_type and dynamism category in each domain. For example, we were able to get 92 queries from the music domain using the script.
+```
+bash run_sample_data.sh
+```
+3. Use the small subset that we have processed for a quick run
+```
+Small data files in this repo
+```
+## Launch agent QnA system
+Here we showcase a RAG agent from the GenAIExamples repo. Please refer to the README in the AgentQnA example for more details. You can build your own agent system using OPEA components, then expose it as an endpoint for this benchmark.
+1. Build images
+```
+git clone
+cd GenAIExamples/AgentQnA/tests/
+bash 1_build_images.sh
+```
+2. Start retrieval tool
+```
+bash 2_start_retrieval_tool.sh
+```
+3. Ingest data into vector database and validate retrieval tool
+```
+# Placeholder - may change depending on data
+bash 3_ingest_data_and_validate_retrieval.sh
+```
+4. Launch and validate agent endpoint
+```
+bash 4_launch_and_validate_agent.sh
+```
+
+## Run CRAG benchmark
+Once you have your agent system up and running, you can follow the steps below to run the benchmark.
+1. Generate answers with agent
+Change the variables in the script below and run the script. By default, it will run a sampled set of queries in the music domain.
+```
+cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark
+bash run_generate_answer.sh
+```
+2. Use LLM-as-judge to grade the answers
+```
+```
+
diff --git a/evals/evaluation/crag_eval/preprocess_data/process_data.py b/evals/evaluation/crag_eval/preprocess_data/process_data.py
new file mode 100644
index 00000000..6f0c6775
--- /dev/null
+++ b/evals/evaluation/crag_eval/preprocess_data/process_data.py
@@ -0,0 +1,111 @@
+import json
+import os
+import argparse
+import tqdm
+
+def split_text(text, chunk_size=2000, chunk_overlap=400):
+    from langchain_text_splitters import RecursiveCharacterTextSplitter
+    text_splitter = RecursiveCharacterTextSplitter(
+        # Configure the splitter; defaults are 2000-character chunks with 400-character overlap.
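+        # Split preferentially on paragraph breaks, then line breaks, then sentence-ending punctuation.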
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        length_function=len,
+        is_separator_regex=False,
+        separators=["\n\n", "\n", ".", "!"],
+    )
+    return text_splitter.split_text(text)
+
+def process_html_string(text):
+    from bs4 import BeautifulSoup
+    # print(text)
+    soup = BeautifulSoup(text, features="html.parser")
+
+    # kill all script and style elements
+    for script in soup(["script", "style"]):
+        script.extract()  # rip it out
+
+    # get text
+    text_content = soup.get_text()
+
+    # break into lines and remove leading and trailing space on each
+    lines = (line.strip() for line in text_content.splitlines())
+    # break multi-headlines into a line each
+    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+    # drop blank lines
+    final_text = '\n'.join(chunk for chunk in chunks if chunk)
+    # print(final_text)
+    return final_text
+
+def preprocess_data(input_file):
+    snippet = []
+    return_data = []
+    n = 0
+    with open(input_file, 'r') as f:
+        for line in f:
+            data = json.loads(line)
+
+            # search results snippets --> retrieval corpus docs
+            docs = data['search_results']
+
+            for doc in docs:
+                # chunks = split_text(doc['page_snippet'])
+                # for chunk in chunks:
+                #     snippet.append({
+                #         "query": data['query'],
+                #         "domain": data['domain'],
+                #         "doc": chunk})
+                snippet.append({
+                    "query": data['query'],
+                    "domain": data['domain'],
+                    "doc": doc['page_snippet']})
+            print('-----------------------------------')
+
+            # qa pairs without search results
+            output = {}
+            for k, v in data.items():
+                if k != 'search_results':
+                    output[k] = v
+            return_data.append(output)
+
+    return snippet, return_data
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--filedir', type=str, default=None)
+    parser.add_argument('--docout', type=str, default=None)
+    parser.add_argument('--qaout', type=str, default=None)
+    # parser.add_argument('--chunk_size', type=int, default=10000)
+    # parser.add_argument('--chunk_overlap', type=int, default=0)
+
+    args = parser.parse_args()
+
+    if not os.path.exists(args.docout):
+        os.makedirs(args.docout)
+
+    if not os.path.exists(args.qaout):
+        os.makedirs(args.qaout)
+
+    data_files = os.listdir(args.filedir)
+
+    qa_pairs = []
+    docs = []
+    for file in tqdm.tqdm(data_files):
+        file = os.path.join(args.filedir, file)
+        doc, data = preprocess_data(file)
+        docs.extend(doc)
+        qa_pairs.extend(data)
+
+    # group by domain
+    domains = ["finance", "music", "movie", "sports", "open"]
+
+    for domain in domains:
+        with open(os.path.join(args.docout, "crag_docs_" + domain + ".jsonl"), 'w') as f:
+            for doc in docs:
+                if doc['doc'] != "" and doc['domain'] == domain:
+                    f.write(json.dumps(doc) + '\n')
+
+        with open(os.path.join(args.qaout, "crag_qa_" + domain + ".jsonl"), 'w') as f:
+            for d in qa_pairs:
+                if d['domain'] == domain:
+                    f.write(json.dumps(d) + '\n')
\ No newline at end of file
diff --git a/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh b/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh
new file mode 100644
index 00000000..8ecfa94f
--- /dev/null
+++ b/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh
@@ -0,0 +1,5 @@
+FILEDIR=$WORKDIR/datasets/crag_task_3_dev_v4
+DOCOUT=$WORKDIR/datasets/crag_docs
+QAOUT=$WORKDIR/datasets/crag_qas
+
+python process_data.py --filedir $FILEDIR --docout $DOCOUT --qaout $QAOUT
\ No newline at end of file
diff --git a/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh b/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh
new file mode
100644 index 00000000..f0bce763 --- /dev/null +++ b/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh @@ -0,0 +1,3 @@ +FILEDIR=$WORKDIR/datasets/crag_docs + +python sample_data.py --filedir $FILEDIR \ No newline at end of file diff --git a/evals/evaluation/crag_eval/preprocess_data/sample_data.py b/evals/evaluation/crag_eval/preprocess_data/sample_data.py new file mode 100644 index 00000000..1fcb4916 --- /dev/null +++ b/evals/evaluation/crag_eval/preprocess_data/sample_data.py @@ -0,0 +1,26 @@ +import json +import pandas as pd +import os +import argparse +import tqdm + +def sample_data(input_file, output_file): + df = pd.read_json(input_file, lines=True, convert_dates=False) + # group by `question_type` and `static_or_dynamic` + df_grouped = df.groupby(['question_type', 'static_or_dynamic']) + # sample 5 rows from each group if there are more than 5 rows else return all rows + df_sampled = df_grouped.apply(lambda x: x.sample(5) if len(x) > 5 else x) + # save sampled data to output file + df_sampled.to_json(output_file, orient='records', lines=True) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--filedir', type=str, default=None) + + args = parser.parse_args() + + data_files = os.listdir(args.filedir) + for file in tqdm.tqdm(data_files): + file = os.path.join(args.filedir, file) + output_file = file.replace('.jsonl', '_sampled.jsonl') + sample_data(file, output_file) \ No newline at end of file diff --git a/evals/evaluation/crag_eval/requirements.txt b/evals/evaluation/crag_eval/requirements.txt new file mode 100644 index 00000000..1411a4a0 --- /dev/null +++ b/evals/evaluation/crag_eval/requirements.txt @@ -0,0 +1 @@ +pandas \ No newline at end of file diff --git a/evals/evaluation/crag_eval/run_benchmark/generate_answers.py b/evals/evaluation/crag_eval/run_benchmark/generate_answers.py new file mode 100644 index 00000000..6c55da3c --- /dev/null +++ b/evals/evaluation/crag_eval/run_benchmark/generate_answers.py @@ -0,0 +1,71 @@ +import requests +import json +import argparse +import os +import pandas as pd + +def get_test_data(args): + if args.query_file.endswith('.jsonl'): + df = pd.read_json(args.query_file, lines=True, convert_dates=False) + elif args.query_file.endswith('.csv'): + df = pd.read_csv(args.query_file) + return df + +def generate_answer(url, prompt): + proxies = {"http": ""} + payload = { + "query":prompt, + } + response = requests.post(url, json=payload, proxies=proxies) + answer = response.json()["text"] + return answer + +def save_results(output_file, output_list): + with open(output_file, "w") as f: + for output in output_list: + f.write(json.dumps(output)) + f.write("\n") + +def save_as_csv(output): + df = pd.read_json(output, lines=True, convert_dates=False) + df.to_csv(output.replace(".jsonl", ".csv"), index=False) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--endpoint_url", type=str, default=None, help="url of the agent QnA system endpoint") + parser.add_argument("--query_file", type=str, default=None, help="query jsonl file") + parser.add_argument("--output_file", type=str, default="output.jsonl", help="output jsonl file") + args = parser.parse_args() + + url = args.endpoint_url + + df = get_test_data(args) + # df = df.head() # for validation purpose + + output_list = [] + n = 0 + for _, row in df.iterrows(): + q = row['query'] + t = row['query_time'] + prompt = "Question: {}\nThe question was asked at: {}".format(q, t) + print('******Query:\n',prompt) + 
print("******Agent is working on the query") + answer = generate_answer(url, prompt) + print('******Answer from agent:\n',answer) + print('='*50) + output_list.append( + { + "query": q, + "query_time": t, + "ref_answer": row["answer"], + "answer": answer, + "question_type": row["question_type"], + "static_or_dynamic": row["static_or_dynamic"], + } + ) + save_results(args.output_file, output_list) + # n += 1 + # if n > 1: + # break + save_results(args.output_file, output_list) + save_as_csv(args.output_file) \ No newline at end of file diff --git a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py new file mode 100644 index 00000000..b4110a5e --- /dev/null +++ b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py @@ -0,0 +1,8 @@ +from evals.metrics.answer_relevancy import AnswerRelevancyMetric +from evals.metrics.ragas import RagasMetric +from evals.metrics import bleu_score, rougeL_score + +# check data format requirements for each metric +# check answer relevancy vs ragas answer relevancy +# check answer correctness +# check if open-source llm can be used diff --git a/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh b/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh new file mode 100644 index 00000000..24b477ee --- /dev/null +++ b/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh @@ -0,0 +1,13 @@ +host_ip=$(hostname -I | awk '{print $1}') # change this to the IP of the service +port=9095 # change this to the port of the service +endpoint=${port}/v1/chat/completions # change this to the endpoint of the service +URL="http://${host_ip}:${endpoint}" +echo "AGENT ENDPOINT URL: ${URL}" + +QUERYFILE=$WORKDIR/datasets/crag_qas/crag_qa_music_sampled.jsonl +OUTPUTFILE=$WORKDIR/datasets/crag_results/results.jsonl + +python generate_answers.py \ +--endpoint_url ${URL} \ +--query_file $QUERYFILE \ +--output_file $OUTPUTFILE From 6da78fe7f0735e42f3040415ede3b5ed350dfe86 Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Fri, 23 Aug 2024 13:31:31 -0700 Subject: [PATCH 02/18] add first pass llm eval code --- evals/evaluation/crag_eval/README.md | 28 +++++++++-- evals/evaluation/crag_eval/requirements.txt | 1 - .../crag_eval/run_benchmark/grade_answers.py | 50 ++++++++++++++++--- .../llm_judge/docker-compose-llm-judge.yaml | 19 +++++++ .../llm_judge/launch_llm_judge_endpoint.sh | 4 ++ .../crag_eval/run_benchmark/run_grading.sh | 8 +++ 6 files changed, 98 insertions(+), 12 deletions(-) delete mode 100644 evals/evaluation/crag_eval/requirements.txt create mode 100644 evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml create mode 100644 evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh create mode 100644 evals/evaluation/crag_eval/run_benchmark/run_grading.sh diff --git a/evals/evaluation/crag_eval/README.md b/evals/evaluation/crag_eval/README.md index 5f68b24c..52e2e028 100644 --- a/evals/evaluation/crag_eval/README.md +++ b/evals/evaluation/crag_eval/README.md @@ -9,13 +9,16 @@ export $WORKDIR= cd $WORKDIR git clone https://github.com/opea-project/GenAIEval.git ``` -2. Create conda environment and install packages +2. Build docker image ``` -conda create -n crag-benchmark-env python=3.11 -conda activate crag-benchmark-env -cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/ -pip install -r requirements.txt +cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/docker/ +bash build_image.sh ``` +3. 
Start docker container +``` +bash launch_eval_container.sh +``` + ## CRAG dataset 1. Download original data and process it with commands below. You need to create an account on the Meta CRAG challenge [website](https://www.aicrowd.com/challenges/meta-comprehensive-rag-benchmark-kdd-cup-2024). After login, go to this [link](https://www.aicrowd.com/challenges/meta-comprehensive-rag-benchmark-kdd-cup-2024/problems/meta-kdd-cup-24-crag-end-to-end-retrieval-augmented-generation/dataset_files) and download the `crag_task_3_dev_v4.tar.bz2` file. Then make a `datasets` directory in your work directory using the commands below. @@ -74,6 +77,21 @@ cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark bash run_generate_answer.sh ``` 2. Use LLM-as-judge to grade the answers +First, in another terminal, launch llm endpoint with HF TGI +``` +cd llm_judge +bash launch_llm_judge_endpoint.sh +``` +Validate that the llm endpoint is working properly. +``` +export host_ip=$(hostname -I | awk '{print $1}') +curl ${host_ip}:8085/generate_stream \ + -X POST \ + -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \ + -H 'Content-Type: application/json' +``` +Second, back to the interactive crag-eval docker, run command below ``` +bash run_grading.sh ``` diff --git a/evals/evaluation/crag_eval/requirements.txt b/evals/evaluation/crag_eval/requirements.txt deleted file mode 100644 index 1411a4a0..00000000 --- a/evals/evaluation/crag_eval/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pandas \ No newline at end of file diff --git a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py index b4110a5e..35413e8d 100644 --- a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py +++ b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py @@ -1,8 +1,46 @@ -from evals.metrics.answer_relevancy import AnswerRelevancyMetric from evals.metrics.ragas import RagasMetric -from evals.metrics import bleu_score, rougeL_score +import argparse +import pandas as pd +import os + +def convert_data_format_for_ragas(data): + # data: pandas dataframe + # columns: ['query', 'answer', 'ref_answer'] + # return: a dict with keys: 'input', 'actual_output', 'expected_output' + output = { + 'input': data['query'].tolist(), + 'actual_output': data['answer'].tolist(), + 'expected_output': data['ref_answer'].tolist(), + 'retrieval_context': data['ref_answer'].tolist() + } + return output + + +def grade_answers(args, test_case): + from langchain_community.embeddings import HuggingFaceBgeEmbeddings + print('==============getting embeddings==============') + embeddings = HuggingFaceBgeEmbeddings(model_name=args.embed_model) + print('==============initiating metric==============') + metric = RagasMetric(threshold=0.5, + metrics=["answer_correctness"], + model= args.llm_endpoint, + embeddings=embeddings) + print('==============start grading==============') + metric.measure(test_case) + print(metric.score) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--embed_model", type=str, default="BAAI/bge-base-en-v1.5") + parser.add_argument("--llm_endpoint", type=str, default="http://localhost:8008") + parser.add_argument("--filedir", type=str, help="Path to the file containing the data") + parser.add_argument("--filename", type=str, help="Name of the file containing the data") + args = parser.parse_args() + + data = pd.read_csv(os.path.join(args.filedir, args.filename)) + data = data.head(2) + print(data) + test_case = 
convert_data_format_for_ragas(data) + print(test_case) + grade_answers(args, test_case) -# check data format requirements for each metric -# check answer relevancy vs ragas answer relevancy -# check answer correctness -# check if open-source llm can be used diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml new file mode 100644 index 00000000..4f6f7f23 --- /dev/null +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml @@ -0,0 +1,19 @@ +version: "3.8" + +services: + tgi_service: + image: ghcr.io/huggingface/text-generation-inference:2.1.0 + container_name: tgi-service + ports: + - "8085:80" + volumes: + - ${HF_CACHE_DIR}:/data + shm_size: 1g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HF_HUB_DISABLE_PROGRESS_BARS: 1 + HF_HUB_ENABLE_HF_TRANSFER: 0 + command: --model-id ${LLM_MODEL_ID} \ No newline at end of file diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh b/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh new file mode 100644 index 00000000..d927cc05 --- /dev/null +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh @@ -0,0 +1,4 @@ +export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" #"meta-llama/Meta-Llama-3-70B-Instruct" # +export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} +export HF_CACHE_DIR=${HF_CACHE_DIR} +docker compose -f docker-compose-llm-judge.yaml up -d \ No newline at end of file diff --git a/evals/evaluation/crag_eval/run_benchmark/run_grading.sh b/evals/evaluation/crag_eval/run_benchmark/run_grading.sh new file mode 100644 index 00000000..ff5a22f7 --- /dev/null +++ b/evals/evaluation/crag_eval/run_benchmark/run_grading.sh @@ -0,0 +1,8 @@ +FILEDIR=$WORKDIR/datasets/crag_results/ +FILENAME=crag_20queries_react_docgradertool_top5apis_v2sysm_gpt4omini.csv +LLM_ENDPOINT=http://${host_ip}:8085 + +python3 grade_answers.py \ +--filedir $FILEDIR \ +--filename $FILENAME \ +--llm_endpoint $LLM_ENDPOINT \ From d57f9d5c26428bae0899bdb677c0ce5335055ef8 Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Mon, 26 Aug 2024 09:53:40 -0700 Subject: [PATCH 03/18] fix answer correctness code Signed-off-by: minmin-intel --- evals/evaluation/crag_eval/README.md | 12 +++++++-- .../crag_eval/run_benchmark/grade_answers.py | 3 ++- evals/metrics/ragas/ragas.py | 25 +++++++++++++++++-- 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/evals/evaluation/crag_eval/README.md b/evals/evaluation/crag_eval/README.md index 52e2e028..aa7097be 100644 --- a/evals/evaluation/crag_eval/README.md +++ b/evals/evaluation/crag_eval/README.md @@ -14,7 +14,15 @@ git clone https://github.com/opea-project/GenAIEval.git cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/docker/ bash build_image.sh ``` -3. Start docker container +3. Set environment vars for downloading models from Huggingface +``` +mkdir $WORKDIR/hf_cache +export HF_CACHE_DIR=$WORKDIR/hf_cache +export HF_HOME=$HF_CACHE_DIR +export HUGGINGFACEHUB_API_TOKEN= +``` +4. Start docker container +This container will be used to preprocess dataset and run benchmark scripts. ``` bash launch_eval_container.sh ``` @@ -42,7 +50,7 @@ The CRAG dataset has more than 4000 queries, and running all of them can be very ``` bash run_sample_data.sh ``` -3. 
Use the small subset that we have processed for a quick run +4. Use the small subset that we have processed for a quick run ``` Small data files in this repo ``` diff --git a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py index 35413e8d..21c16db3 100644 --- a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py +++ b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py @@ -1,4 +1,5 @@ from evals.metrics.ragas import RagasMetric +from ragas.metrics import answer_correctness import argparse import pandas as pd import os @@ -11,7 +12,7 @@ def convert_data_format_for_ragas(data): 'input': data['query'].tolist(), 'actual_output': data['answer'].tolist(), 'expected_output': data['ref_answer'].tolist(), - 'retrieval_context': data['ref_answer'].tolist() + 'retrieval_context': [["dummy_context"] for _ in range(data['query'].shape[0])] } return output diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index 4069a62c..945b8671 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -16,6 +16,27 @@ def format_ragas_metric_name(name: str): return f"{name} (ragas)" +def get_metric(name: str): + validated_list = ["answer_relevancy", "faithfulness", "answer_correctness"] + if name == "answer_relevancy": + from ragas.metrics import answer_relevancy + return answer_relevancy + elif name == "faithfulness": + from ragas.metrics import faithfulness + return faithfulness + elif name == "answer_correctness": + from ragas.metrics import answer_correctness + return answer_correctness + else: + raise ValueError( + "metric should be in supported list {}. ".format(validated_list) + + "ClientResponseError raised with LangchainLLM " + + "when context_precision, context_recall ran. " + + "Here are the related issues described in ragas " + "https://github.com/explodinggradients/ragas/issues/934, " + + "https://github.com/explodinggradients/ragas/issues/664." + ) + class RagasMetric: """This metric checks if the output is more than 3 letters.""" @@ -41,7 +62,7 @@ def measure(self, test_case: Dict): # sends to server try: from ragas import evaluate - from ragas.metrics import answer_relevancy, faithfulness + from ragas.metrics import answer_relevancy, faithfulness, answer_correctness except ModuleNotFoundError: raise ModuleNotFoundError("Please install ragas to use this metric. 
`pip install ragas`.") @@ -85,7 +106,7 @@ def measure(self, test_case: Dict): else: if metric == "answer_relevancy" and self.embeddings is None: raise ValueError("answer_relevancy metric need provide embeddings model.") - tmp_metrics.append(metric) + tmp_metrics.append(get_metric(metric)) self.metrics = tmp_metrics else: self.metrics = [ From a61efa8f127297e6871d9cfe3a18ecd0e1548530 Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Mon, 26 Aug 2024 09:59:45 -0700 Subject: [PATCH 04/18] docker container for crag eval --- evals/evaluation/crag_eval/docker/Dockerfile | 23 +++++++++++++++++++ .../crag_eval/docker/build_image.sh | 9 ++++++++ .../crag_eval/docker/launch_eval_container.sh | 4 ++++ .../crag_eval/docker/requirements.txt | 7 ++++++ 4 files changed, 43 insertions(+) create mode 100644 evals/evaluation/crag_eval/docker/Dockerfile create mode 100644 evals/evaluation/crag_eval/docker/build_image.sh create mode 100644 evals/evaluation/crag_eval/docker/launch_eval_container.sh create mode 100644 evals/evaluation/crag_eval/docker/requirements.txt diff --git a/evals/evaluation/crag_eval/docker/Dockerfile b/evals/evaluation/crag_eval/docker/Dockerfile new file mode 100644 index 00000000..0421000a --- /dev/null +++ b/evals/evaluation/crag_eval/docker/Dockerfile @@ -0,0 +1,23 @@ +FROM ubuntu:22.04 + +WORKDIR /home/user + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + python3.11 \ + python3-pip \ + libpoppler-cpp-dev \ + wget \ + git \ + poppler-utils \ + libmkl-dev + +COPY requirements.txt /home/user/requirements.txt + +RUN pip install -r requirements.txt + +RUN cd /home/user/ && \ + git clone https://github.com/opea-project/GenAIEval.git + +ENV PYTHONPATH=$PYTHONPATH:/home/user/GenAIEval/ + +WORKDIR /home/user \ No newline at end of file diff --git a/evals/evaluation/crag_eval/docker/build_image.sh b/evals/evaluation/crag_eval/docker/build_image.sh new file mode 100644 index 00000000..284762c7 --- /dev/null +++ b/evals/evaluation/crag_eval/docker/build_image.sh @@ -0,0 +1,9 @@ +dockerfile=Dockerfile + +docker build \ + -f ${dockerfile} . 
\ + -t crag-eval:latest \ + --network=host \ + --build-arg http_proxy=${http_proxy} \ + --build-arg https_proxy=${https_proxy} \ + --build-arg no_proxy=${no_proxy} \ \ No newline at end of file diff --git a/evals/evaluation/crag_eval/docker/launch_eval_container.sh b/evals/evaluation/crag_eval/docker/launch_eval_container.sh new file mode 100644 index 00000000..32b27f8e --- /dev/null +++ b/evals/evaluation/crag_eval/docker/launch_eval_container.sh @@ -0,0 +1,4 @@ +volume=$WORKDIR +host_ip=$(hostname -I | awk '{print $1}') + +docker run -it -v $volume:/home/user/ -e WORKDIR=/home/user -e HF_HOME=/home/user/hf_cache -e host_ip=$host_ip -e http_proxy=$http_proxy -e https_proxy=$https_proxy crag-eval:latest \ No newline at end of file diff --git a/evals/evaluation/crag_eval/docker/requirements.txt b/evals/evaluation/crag_eval/docker/requirements.txt new file mode 100644 index 00000000..662f3146 --- /dev/null +++ b/evals/evaluation/crag_eval/docker/requirements.txt @@ -0,0 +1,7 @@ +pandas +datasets +ragas +langchain-community +evaluate +jieba +sentence_transformers \ No newline at end of file From 914cdc2a68b6307425445bb752250cb788ffc70d Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Mon, 26 Aug 2024 10:00:15 -0700 Subject: [PATCH 05/18] sample data for testing --- .../run_benchmark/sample_answers.csv | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 evals/evaluation/crag_eval/run_benchmark/sample_answers.csv diff --git a/evals/evaluation/crag_eval/run_benchmark/sample_answers.csv b/evals/evaluation/crag_eval/run_benchmark/sample_answers.csv new file mode 100644 index 00000000..790f6f6d --- /dev/null +++ b/evals/evaluation/crag_eval/run_benchmark/sample_answers.csv @@ -0,0 +1,28 @@ +query,query_time,ref_answer,answer,context,num_llm_calls,total_tokens,question_type,static_or_dynamic,selected_tools +how many reading and leeds festivals has the band foo fighters headlined?,"03/13/2024, 09:55:12 PT",foo fighters have headlined a total of 4 reading and leeds festivals.,"The Foo Fighters have headlined the Reading and Leeds Festivals four times: in 2002, 2005, 2012, and 2019.","[[{'id': 'call_05BuB1KtO8uuJypQh6lecBVM', 'function': {'arguments': '{""query"":""Foo Fighters Reading and Leeds Festival headlining history""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'The Foo Fighters have a notable history of headlining the Reading and Leeds Festivals. They first headlined the festival in 2002 and returned to headline again in 2005. After a significant gap, they headlined once more in 2012. In 2019, they were announced as one of the headliners again, marking their return to the festival after several years. Overall, they have been a prominent presence at the festival since their first appearance in 1995.', 'The Foo Fighters have headlined the Reading and Leeds Festivals four times: in 2002, 2005, 2012, and 2019.']",3,959,aggregation,slow-changing,"['get_members', 'get_song_release_date', 'get_song_author', 'get_grammy_award_count_by_song', 'get_grammy_award_count_by_artist']" +how many songs has the band the beatles released that have been recorded at abbey road studios?,"03/13/2024, 09:54:32 PT",the beatles have released 190 songs that have been recorded at abbey road studios.,"The Beatles recorded most of their music at Abbey Road Studios, but the exact number of songs they released that were recorded there is not specified in the available information. 
However, it is known that they recorded a significant portion of their catalog at this studio from 1957 to 1970.","[[{'id': 'call_wthw0YH3cxU3yzYMdhfyY2OI', 'function': {'arguments': '{""query"":""The Beatles songs recorded at Abbey Road Studios""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'The Beatles recorded most of their music at Abbey Road Studios, formerly known as EMI Recording Studios, from their inception in 1957 until their break-up in 1970. This includes iconic albums such as ""Abbey Road,"" ""Sgt. Pepper\'s Lonely Hearts Club Band,"" and ""The White Album."" The studio is renowned for its cultural and historical significance in the music industry, particularly for the Beatles\' extensive catalog.', 'The Beatles recorded most of their music at Abbey Road Studios, but the exact number of songs they released that were recorded there is not specified in the available information. However, it is known that they recorded a significant portion of their catalog at this studio from 1957 to 1970.']",3,968,aggregation,static,"['get_grammy_award_count_by_song', 'get_song_release_date', 'get_song_release_country', 'get_members', 'search_song_entity_by_name']" +did the song the globe come out before the wedding?,"03/21/2024, 23:32:29 PT",yes,"The song ""The Globe"" was released in 1991, while ""The Wedding"" was released on September 23, 2016. Therefore, ""The Globe"" came out before ""The Wedding.""","[[{'id': 'call_7wKEamOXFKVmki6gklVACm0M', 'function': {'arguments': '{""song_name"": ""The Globe""}', 'name': 'get_song_release_date'}, 'type': 'function'}, {'id': 'call_rfOh7EoCSkyDRGPe7DvgKMFE', 'function': {'arguments': '{""song_name"": ""The Wedding""}', 'name': 'get_song_release_date'}, 'type': 'function'}], '{""result"": ""1991""}', '{""result"": ""2016-09-23""}', 'The song ""The Globe"" was released in 1991, while ""The Wedding"" was released on September 23, 2016. Therefore, ""The Globe"" came out before ""The Wedding.""']",2,839,comparison,static,"['get_song_release_country', 'search_song_entity_by_name', 'get_song_release_date', 'get_grammy_best_song_by_year', 'get_song_author']" +"who has had more number one hits on the us billboard hot 100 chart, michael jackson or elvis presley?","03/13/2024, 09:43:49 PT","elvis presley has had more number one hits on the us billboard hot 100 chart than michael jackson, with 18 number one hits compared to jackson's 13.","Elvis Presley had more number one hits on the US Billboard Hot 100 chart, with a total of 18 hits, compared to Michael Jackson's 13 number one hits.","[[{'id': 'call_i5H8SAsGgRTXnvI0sDm5wmrv', 'function': {'arguments': '{""query"": ""Michael Jackson number one hits on US Billboard Hot 100""}', 'name': 'search_knowledge_base'}, 'type': 'function'}, {'id': 'call_Mgec4kbeE0z4h3QhJ0ENNTHU', 'function': {'arguments': '{""query"": ""Elvis Presley number one hits on US Billboard Hot 100""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Michael Jackson had 13 number-one hits on the US Billboard Hot 100.', 'Elvis Presley had 18 number one hits on the US Billboard Hot 100 chart. 
His number one singles spent a total of 81 weeks at the top position, which is the most of any artist.', ""Elvis Presley had more number one hits on the US Billboard Hot 100 chart, with a total of 18 hits, compared to Michael Jackson's 13 number one hits.""]",4,1136,comparison,static,"['get_grammy_award_count_by_song', 'get_grammy_award_count_by_artist', 'get_song_author', 'search_song_entity_by_name', 'get_song_release_country']" +when was alex van halen the drummer for the band santana?,"03/13/2024, 10:04:31 PT",invalid question,Invalid question.,['Invalid question.'],1,343,false_premise,static,"['get_members', 'get_song_author', 'get_song_release_date', 'get_grammy_award_date_by_artist', 'get_grammy_award_count_by_song']" +"when did dolly parton's song, blown away, come out?","03/10/2024, 23:28:06 PT",invalid question,"Carrie Underwood's song ""Blown Away"" was released on May 1, 2012.","[[{'id': 'call_7ZIr0njVYoqlfvoCdd6pLmHb', 'function': {'arguments': '{""song_name"":""Blown Away""}', 'name': 'get_song_release_country'}, 'type': 'function'}], '{""result"": ""ID""}', [{'id': 'call_ZUYqfFFb4VRGVHwhrG3nsV8L', 'function': {'arguments': '{""song_name"":""Blown Away""}', 'name': 'get_song_author'}, 'type': 'function'}], '{""result"": ""Carrie Underwood""}', [{'id': 'call_mA2bAMtysJClId77OoUan9yO', 'function': {'arguments': '{""query"":""Blown Away Carrie Underwood release date""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Carrie Underwood\'s album ""Blown Away"" was released on May 1, 2012.', 'Carrie Underwood\'s song ""Blown Away"" was released on May 1, 2012.']",5,1798,false_premise,static,"['get_grammy_award_count_by_song', 'get_song_author', 'get_song_release_country', 'get_grammy_best_song_by_year', 'get_grammy_award_date_by_artist']" +what grammy award did edgar barrera win this year?,"03/05/2024, 23:25:22 PT",invalid question,"Edgar Barrera won a Grammy award in 2014, but there is no record of him winning a Grammy award in 2024.","[[{'id': 'call_ptgqp9zavnUvzSqmBNPNa8R3', 'function': {'arguments': '{""artist_name"":""Edgar Barrera""}', 'name': 'get_grammy_award_date_by_artist'}, 'type': 'function'}], '{""result"": [2014]}', 'Edgar Barrera won a Grammy award in 2014, but there is no record of him winning a Grammy award in 2024.']",2,828,false_premise,static,"['get_grammy_award_date_by_artist', 'get_grammy_best_artist_by_year', 'get_grammy_best_album_by_year', 'get_grammy_award_count_by_artist', 'get_grammy_best_song_by_year']" +when did miley cyrus win grammy best new artist award?,"03/21/2024, 23:38:14 PT",invalid question,Miley Cyrus has not won the Grammy Best New Artist award.,"[[{'id': 'call_wCWNt9LOduR5IESTc1jeB6n4', 'function': {'arguments': '{""artist_name"":""Miley Cyrus""}', 'name': 'get_grammy_award_date_by_artist'}, 'type': 'function'}], '{""result"": []}', 'Miley Cyrus has not won the Grammy Best New Artist award.']",2,790,false_premise,static,"['get_all_awarded_artists', 'get_grammy_best_artist_by_year', 'get_grammy_award_date_by_artist', 'get_grammy_award_count_by_artist', 'get_grammy_award_count_by_song']" +when was stevie wonder the keyboardist for the band the rolling stones?,"03/13/2024, 10:01:54 PT",invalid question,Invalid question.,['Invalid question.'],1,355,false_premise,static,"['get_grammy_award_date_by_artist', 'get_artist_birth_date', 'get_grammy_award_count_by_artist', 'get_grammy_award_count_by_song', 'get_artist_birth_place']" +who is nicki minaj's latest diss track about?,"03/10/2024, 23:32:04 PT","nicki minaj's song ""big foot"" is a 
diss track about megan thee stallion.","Nicki Minaj's latest track, titled ""Big Foot,"" released in early 2024, is widely speculated to be a response to Megan Thee Stallion's song ""Hiss."" While many consider it a diss track, Nicki has denied that it is intended as such, stating, ""Who said 'diss track'??????"" The lyrics seem to take shots at Megan, but Nicki has pushed back against the diss track label.","[[{'id': 'call_trQnbJbaT3p5iPBASlNNFK0s', 'function': {'arguments': '{""query"":""Nicki Minaj latest diss track 2024""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Nicki Minaj\'s latest track, titled ""Big Foot,"" has been released in early 2024 and is widely speculated to be a response to Megan Thee Stallion\'s song ""Hiss."" While many are labeling it a diss track, Nicki herself has denied that it is intended as such, stating, ""Who said \'diss track\'??????"" The song features lyrics that appear to take shots at Megan, but Nicki has pushed back against the diss track label.', 'Nicki Minaj\'s latest track, titled ""Big Foot,"" released in early 2024, is widely speculated to be a response to Megan Thee Stallion\'s song ""Hiss."" While many consider it a diss track, Nicki has denied that it is intended as such, stating, ""Who said \'diss track\'??????"" The lyrics seem to take shots at Megan, but Nicki has pushed back against the diss track label.']",3,1001,multi-hop,fast-changing,"['get_song_author', 'get_all_awarded_artists', 'get_song_release_date', 'get_grammy_best_artist_by_year', 'search_song_entity_by_name']" +what's the most recent album from the founder of ysl records?,"03/05/2024, 23:28:07 PT",business is business,"Sorry, need more steps to process this request.","[[{'id': 'call_2yMQLyoFwYCt2KTxpiQD9I4k', 'function': {'arguments': '{""artist_name"":""Young Stoner Life Records""}', 'name': 'search_artist_entity_by_name'}, 'type': 'function'}], '{""result"": [""Young Stoner Life"", ""Young Life"", ""Stoner"", ""Re*Cords"", ""Alyson Stoner"", ""Art Stoner"", ""Billy Stoner"", ""Brian Stoner"", ""Colin Stoner"", ""David Stoner""]}', [{'id': 'call_dBz3tCbgZz7C9jSKczWKUP5T', 'function': {'arguments': '{""artist_name"":""Young Thug""}', 'name': 'search_artist_entity_by_name'}, 'type': 'function'}], '{""result"": [""Young Thug"", ""Young T.H.U.G."", ""Thug"", ""T.H.U.G"", ""T.H.U.G."", ""Arden Thug"", ""Babs Thug"", ""Bambie Thug"", ""blond thug"", ""Cali Thug""]}', [{'id': 'call_GbuvwSrKem9ZbExvw6gf1U8I', 'function': {'arguments': '{""artist_name"":""Young Thug""}', 'name': 'get_artist_all_works'}, 'type': 'function'}], '{""result"": [""All in a Day"", ""1017 Thug"", ""Black Portland"", ""Thuga Thuga"", ""1017 Thug 2"", ""1017 Thug 3: The Finale"", ""I Came From Nothing 3"", ""I Came From Nothing 2"", ""Barter 6"", ""Slime Season 2"", ""Slime Season 3"", ""I Came From Nothing"", ""Gangster Shit"", ""JEFFERY"", ""Old English"", ""Gang Up"", ""Beautiful Thugger Girls"", ""Young Martha"", ""Danny Glover"", ""Slime Language"", ""On the Rvn!!"", ""The London"", ""Thugga Thugga"", ""Metro Thuggin"", ""Hot (remix)"", ""Slime Season 4"", ""Cheat Code Mode"", ""Trap Zuse"", ""2 Bitches"", ""Young Rich Homie Thuggin"", ""Tell Her Nothing"", ""Stoner (remixes)"", ""Liger"", ""Brick Sqaud"", ""I Came from Nothing 2"", ""A Nightmare From Beyond"", ""Street Is Watching (The Mix-Tape)"", ""BUSINESS IS BUSINESS"", ""From a Man"", ""From A Man""]}', [{'id': 'call_NQ22bfeZLvSY61NxL2YP1YBb', 'function': {'arguments': '{""artist_name"":""Young Thug""}', 'name': 'get_artist_all_works'}, 
'type': 'function'}], '{""result"": [""All in a Day"", ""1017 Thug"", ""Black Portland"", ""Thuga Thuga"", ""1017 Thug 2"", ""1017 Thug 3: The Finale"", ""I Came From Nothing 3"", ""I Came From Nothing 2"", ""Barter 6"", ""Slime Season 2"", ""Slime Season 3"", ""I Came From Nothing"", ""Gangster Shit"", ""JEFFERY"", ""Old English"", ""Gang Up"", ""Beautiful Thugger Girls"", ""Young Martha"", ""Danny Glover"", ""Slime Language"", ""On the Rvn!!"", ""The London"", ""Thugga Thugga"", ""Metro Thuggin"", ""Hot (remix)"", ""Slime Season 4"", ""Cheat Code Mode"", ""Trap Zuse"", ""2 Bitches"", ""Young Rich Homie Thuggin"", ""Tell Her Nothing"", ""Stoner (remixes)"", ""Liger"", ""Brick Sqaud"", ""I Came from Nothing 2"", ""A Nightmare From Beyond"", ""Street Is Watching (The Mix-Tape)"", ""BUSINESS IS BUSINESS"", ""From a Man"", ""From A Man""]}', 'Sorry, need more steps to process this request.']",5,500,multi-hop,slow-changing,"['get_song_author', 'search_artist_entity_by_name', 'get_grammy_award_count_by_artist', 'get_all_awarded_artists', 'get_artist_all_works']" +what's the most recent album from the puerto rican artist that's been in wwe?,"03/10/2024, 23:31:52 PT",nadie sabe lo que va a pasar mañana,"The most recent album from the Puerto Rican artist Bad Bunny, who has been in WWE, is ""Nadie sabe lo que va a pasar mañana,"" released on October 13, 2023.","[[{'id': 'call_b8OrYeD5gSQjBOnUjJvtg6Y8', 'function': {'arguments': '{""artist_name"":""Bad Bunny""}', 'name': 'search_artist_entity_by_name'}, 'type': 'function'}], '{""result"": [""Bad Bunny"", ""Bunny"", ""All Bad Bunny Everything"", ""bAd bAd"", ""The Bad Bad Bad"", ""Bad"", ""B.A.D."", ""BAD!"", ""BAD"", ""Bad Bad Bird""]}', [{'id': 'call_jKkRPtMcqc8Pp9127rxo3jV9', 'function': {'arguments': '{""artist_name"":""Bad Bunny""}', 'name': 'get_grammy_award_count_by_artist'}, 'type': 'function'}], '{""result"": 0}', [{'id': 'call_7PySlZykLTc7aHyLSi5EdMmJ', 'function': {'arguments': '{""artist_name"":""Bad Bunny""}', 'name': 'get_grammy_award_date_by_artist'}, 'type': 'function'}], '{""result"": []}', [{'id': 'call_21vZE6bAa9CfdWONNgwEYBAU', 'function': {'arguments': '{""query"":""Bad Bunny discography""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Bad Bunny\'s discography includes several studio albums, with his latest being ""Nadie sabe lo que va a pasar mañana,"" released on October 13, 2023. This album features 22 tracks and includes collaborations with artists such as Young Miko, Feid, Arcángel, Bryant Myers, and Eladio Carrión. His previous albums include ""Un verano sin ti"" and others that have contributed to his recognition as a leading artist in the Latin music scene.', 'The most recent album from the Puerto Rican artist Bad Bunny, who has been in WWE, is ""Nadie sabe lo que va a pasar mañana,"" released on October 13, 2023.']",6,2614,multi-hop,slow-changing,"['get_all_awarded_artists', 'get_grammy_award_count_by_artist', 'get_grammy_award_date_by_artist', 'search_artist_entity_by_name', 'get_grammy_best_artist_by_year']" +who has played drums for the red hot chili peppers?,"03/17/2024, 17:08:10 PT","chad smith, jack irons, and cliff martinez","The drummers who have played for the Red Hot Chili Peppers include: + +1. Chad Smith +2. Jack Irons +3. Cliff Martinez +4. D.H. 
Peligro + +These are some of the notable drummers associated with the band.","[[{'id': 'call_O5mU9etmvrZ2rpqe38tvglh8', 'function': {'arguments': '{""band_name"":""Red Hot Chili Peppers""}', 'name': 'get_members'}, 'type': 'function'}], '{""result"": [""John Frusciante"", ""Flea"", ""D.H. Peligro"", ""DeWayne \\""Blackbyrd\\"" McKnight"", ""Arik Marshall"", ""Josh Klinghoffer"", ""Chad Smith"", ""Dave Navarro"", ""Hillel Slovak"", ""Jack Irons"", ""Jack Sherman"", ""Cliff Martinez"", ""Anthony Kiedis"", ""Jesse Tobias""]}', 'The drummers who have played for the Red Hot Chili Peppers include:\n\n1. Chad Smith\n2. Jack Irons\n3. Cliff Martinez\n4. D.H. Peligro\n\nThese are some of the notable drummers associated with the band.']",2,858,set,static,"['get_members', 'get_song_author', 'get_artist_birth_place', 'get_grammy_award_count_by_artist', 'get_grammy_award_count_by_song']" +how many tracks are in drake's last album?,"03/12/2024, 12:28:41 PT",23,"Drake's last album, ""For All the Dogs,"" has a total of 23 tracks.","[[{'id': 'call_nXv914FUcVGCYeyW2MdmigSz', 'function': {'arguments': '{""query"":""Drake last album track count""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Drake\'s last album, ""For All the Dogs,"" has a track count of 23.', 'Drake\'s last album, ""For All the Dogs,"" has a total of 23 tracks.']",3,852,simple,slow-changing,"['get_grammy_award_count_by_song', 'get_grammy_award_count_by_artist', 'get_all_awarded_artists', 'get_song_release_date', 'get_song_release_country']" +how many number one hits did taylor swift have?,"03/12/2024, 12:28:35 PT",11,"As of February 24, 2024, Taylor Swift has 11 number one hits on the Billboard Hot 100.","[[{'id': 'call_AeMTBYLShMlu3PzfQztMCxns', 'function': {'arguments': '{""query"":""Taylor Swift number one hits""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'As of the Hot 100 dated February 24, 2024, Taylor Swift has 11 number one hits on the Billboard Hot 100.', 'As of February 24, 2024, Taylor Swift has 11 number one hits on the Billboard Hot 100.']",3,877,simple,slow-changing,"['get_grammy_award_count_by_song', 'get_grammy_award_count_by_artist', 'get_song_author', 'get_grammy_award_date_by_artist', 'search_song_entity_by_name']" +where is marem ladson's original home country?,"03/21/2024, 23:41:53 PT",es,Marem Ladson's original home country is Spain (ES).,"[[{'id': 'call_Zey99OPDixDXc3nUHZnD5MVC', 'function': {'arguments': '{""artist_name"":""Marem Ladson""}', 'name': 'get_artist_birth_place'}, 'type': 'function'}], '{""result"": ""ES""}', ""Marem Ladson's original home country is Spain (ES).""]",2,716,simple,static,"['get_artist_birth_place', 'get_song_release_country', 'get_song_author', 'get_artist_birth_date', 'search_song_entity_by_name']" +tell me the most recent song or album by doris duke?,"03/21/2024, 23:40:20 PT",i’m a loser: the swamp dogg sessions... 
and more,"The most recent album by Doris Duke is ""I'm A Loser,"" which was released on February 4, 2021.","[[{'id': 'call_IBn6alF1JcL2QNYxDvkd9Nla', 'function': {'arguments': '{""song_name"":""Doris Duke""}', 'name': 'search_song_entity_by_name'}, 'type': 'function'}], '{""result"": [""Passion Flower (For Doris Duke)"", ""Doris"", ""Black Doris"", ""Clara / Doris"", ""Doris 100"", ""Doris Day"", ""Doris Machin"", ""Doris & Me"", ""Doris Rising"", ""Doris Troy""]}', [{'id': 'call_rfPHKHEPZ1UntCcHBZBOeQtM', 'function': {'arguments': '{""query"":""Doris Duke most recent song or album""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Doris Duke\'s most recent album is ""I\'m A Loser,"" which was released on February 4, 2021.', 'The most recent album by Doris Duke is ""I\'m A Loser,"" which was released on February 4, 2021.']",4,1418,simple_w_condition,fast-changing,"['search_song_entity_by_name', 'get_grammy_award_count_by_song', 'get_song_release_date', 'get_song_author', 'get_grammy_award_count_by_artist']" +what song topped the billboard chart on 2004-02-04?,"03/21/2024, 23:34:49 PT",hey ya!,"The song that topped the Billboard chart on February 4, 2004, was ""Hey Ya!"" by OutKast.","[[{'id': 'call_nwInngToFLLUFMeKYuR5lwhx', 'function': {'arguments': '{""rank"":1,""date"":""2004-02-04""}', 'name': 'get_billboard_rank_date'}, 'type': 'function'}], '{""result"": [[""Hey Ya!""], [""OutKast""]]}', 'The song that topped the Billboard chart on February 4, 2004, was ""Hey Ya!"" by OutKast.']",2,794,simple_w_condition,fast-changing,"['get_billboard_attributes', 'get_song_release_date', 'search_song_entity_by_name', 'get_song_release_country', 'get_billboard_rank_date']" +what is the most popular song on billboard in 2024-02-28?,"03/21/2024, 23:34:42 PT",texas hold 'em,"The most popular song on Billboard on February 28, 2024, was ""Texas Hold 'Em"" by Beyonce.","[[{'id': 'call_ctgHbLk8zIctCE0qOgjtcTU7', 'function': {'arguments': '{""rank"":1,""date"":""2024-02-28""}', 'name': 'get_billboard_rank_date'}, 'type': 'function'}], '{""result"": [[""Texas Hold \'Em""], [""Beyonce""]]}', 'The most popular song on Billboard on February 28, 2024, was ""Texas Hold \'Em"" by Beyonce.']",2,798,simple_w_condition,fast-changing,"['get_song_release_date', 'get_billboard_attributes', 'get_song_release_country', 'search_song_entity_by_name', 'get_billboard_rank_date']" +"what album did maroon five release in 2010, which included the songs ""moves like jagger"" and ""misery""?","03/13/2024, 09:49:01 PT","maroon five released the album ""hands all over"" in 2010, which included the songs ""moves like jagger"" and ""misery"".","Sorry, need more steps to process this request.","[[{'id': 'call_TiKh63jRcIYXYAsXxrzUB0Ip', 'function': {'arguments': '{""song_name"":""moves like jagger""}', 'name': 'search_song_entity_by_name'}, 'type': 'function'}], '{""result"": [""Moves Like Jagger"", ""Moves Like Jagger (remix)"", ""Moves Like Jagger (Remixes)"", ""D.N.A. 
Moves Like Jagger Mashup"", ""Moves Like Jagger / Jumpin\' Jack Flash"", ""Moves Like Jagger (The Voice Performance)"", ""Move Like Jagger"", ""Foxy Jagger"", ""Jagger \'67"", ""Peter Jagger""]}', [{'id': 'call_KlfgNIKNzIjm9qAwpWXOkl5n', 'function': {'arguments': '{""song_name"":""misery""}', 'name': 'search_song_entity_by_name'}, 'type': 'function'}], '{""result"": [""Misery"", ""MISERY"", ""misery"", ""Misery..."", ""Artificial Misery"", ""Beautiful Misery"", ""Blissful Misery"", ""Burn // Misery"", ""Collected Misery"", ""Concret\\u00e9 Misery""]}', [{'id': 'call_NePkutCu9BVaeeMEV2xJNXhE', 'function': {'arguments': '{""song_name"":""moves like jagger""}', 'name': 'get_grammy_award_count_by_song'}, 'type': 'function'}], '{""result"": 0}', [{'id': 'call_0fGXDvgbkvbEHMnRNBdIfK7i', 'function': {'arguments': '{""song_name"":""misery""}', 'name': 'get_grammy_award_count_by_song'}, 'type': 'function'}], '{""result"": 0}', 'Sorry, need more steps to process this request.']",5,500,simple_w_condition,static,"['get_song_release_date', 'get_song_release_country', 'get_grammy_best_album_by_year', 'get_grammy_award_count_by_song', 'search_song_entity_by_name']" From 1b49b1b6a05f56d3adb9eb42f61c022f49090d67 Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Mon, 26 Aug 2024 10:17:05 -0700 Subject: [PATCH 06/18] docker compose for tgi gaudi Signed-off-by: minmin-intel --- .../docker-compose-llm-judge-gaudi.yaml | 22 +++++++++++++++++++ .../llm_judge/launch_llm_judge_endpoint.sh | 4 ++-- 2 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml new file mode 100644 index 00000000..a3eb311e --- /dev/null +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml @@ -0,0 +1,22 @@ +services: + tgi-service: + image: ghcr.io/huggingface/tgi-gaudi:latest + container_name: tgi-gaudi-server + ports: + - "8085:80" + volumes: + - ${HF_CACHE_DIR}:/data + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HF_HUB_DISABLE_PROGRESS_BARS: 1 + HF_HUB_ENABLE_HF_TRANSFER: 0 + HABANA_VISIBLE_DEVICES: all + OMPI_MCA_btl_vader_single_copy_mechanism: none + runtime: habana + cap_add: + - SYS_NICE + ipc: host + command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 \ No newline at end of file diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh b/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh index d927cc05..56ce8ab9 100644 --- a/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh @@ -1,4 +1,4 @@ -export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" #"meta-llama/Meta-Llama-3-70B-Instruct" # +export LLM_MODEL_ID="meta-llama/Meta-Llama-3-70B-Instruct" #"meta-llama/Meta-Llama-3-70B-Instruct" # export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export HF_CACHE_DIR=${HF_CACHE_DIR} -docker compose -f docker-compose-llm-judge.yaml up -d \ No newline at end of file +docker compose -f docker-compose-llm-judge-gaudi.yaml up -d \ No newline at end of file From 7867461208b4abe4e68ac5912c5130f03f406b33 Mon Sep 17 
00:00:00 2001 From: minmin-intel Date: Mon, 26 Aug 2024 10:24:24 -0700 Subject: [PATCH 07/18] fix tgi gaudi docker compose for llama3 70b --- .../llm_judge/docker-compose-llm-judge-gaudi.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml index a3eb311e..3bed748f 100644 --- a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml @@ -1,7 +1,7 @@ services: tgi-service: image: ghcr.io/huggingface/tgi-gaudi:latest - container_name: tgi-gaudi-server + container_name: tgi-server ports: - "8085:80" volumes: @@ -15,8 +15,9 @@ services: HF_HUB_ENABLE_HF_TRANSFER: 0 HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none + PT_HPU_ENABLE_LAZY_COLLECTIVES: true runtime: habana cap_add: - SYS_NICE ipc: host - command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 \ No newline at end of file + command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 --sharded true --num-shard 4 \ No newline at end of file From 8e999dcd420f1a6be92fd409da78e5f2ebcaaa6f Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Mon, 26 Aug 2024 11:33:05 -0700 Subject: [PATCH 08/18] update llm eval code Signed-off-by: minmin-intel --- evals/evaluation/crag_eval/docker/Dockerfile | 3 +- .../crag_eval/docker/requirements.txt | 1 + .../crag_eval/run_benchmark/grade_answers.py | 32 ++++++++++++++++--- evals/metrics/ragas/ragas.py | 11 +++++-- 4 files changed, 39 insertions(+), 8 deletions(-) diff --git a/evals/evaluation/crag_eval/docker/Dockerfile b/evals/evaluation/crag_eval/docker/Dockerfile index 0421000a..a3a97c5b 100644 --- a/evals/evaluation/crag_eval/docker/Dockerfile +++ b/evals/evaluation/crag_eval/docker/Dockerfile @@ -9,7 +9,8 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ wget \ git \ poppler-utils \ - libmkl-dev + libmkl-dev \ + curl COPY requirements.txt /home/user/requirements.txt diff --git a/evals/evaluation/crag_eval/docker/requirements.txt b/evals/evaluation/crag_eval/docker/requirements.txt index 662f3146..4579eca7 100644 --- a/evals/evaluation/crag_eval/docker/requirements.txt +++ b/evals/evaluation/crag_eval/docker/requirements.txt @@ -2,6 +2,7 @@ pandas datasets ragas langchain-community +langchain-huggingface evaluate jieba sentence_transformers \ No newline at end of file diff --git a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py index 21c16db3..a3d08322 100644 --- a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py +++ b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py @@ -17,6 +17,22 @@ def convert_data_format_for_ragas(data): return output +def make_list_of_test_cases(data): + # data: pandas dataframe + # columns: ['query', 'answer', 'ref_answer'] + # return: a dict with keys: 'input', 'actual_output', 'expected_output' + output = [] + for _, row in data.iterrows(): + output.append( + { + 'input': [row['query']], + 'actual_output': [row['answer']], + 'expected_output': [row['ref_answer']], + 'retrieval_context': [["dummy_context"]] + } + ) + return output + def grade_answers(args, test_case): from langchain_community.embeddings import HuggingFaceBgeEmbeddings print('==============getting 
embeddings==============') @@ -27,8 +43,14 @@ def grade_answers(args, test_case): model= args.llm_endpoint, embeddings=embeddings) print('==============start grading==============') - metric.measure(test_case) - print(metric.score) + scores = [] + for case in test_case: + metric.measure(case) + scores.append(metric.score) + print(metric.score) + print('-'*50) + # metric.measure(test_case) + return scores if __name__ == '__main__': parser = argparse.ArgumentParser() @@ -41,7 +63,9 @@ def grade_answers(args, test_case): data = pd.read_csv(os.path.join(args.filedir, args.filename)) data = data.head(2) print(data) - test_case = convert_data_format_for_ragas(data) + # test_case = convert_data_format_for_ragas(data) + test_case = make_list_of_test_cases(data) print(test_case) - grade_answers(args, test_case) + scores = grade_answers(args, test_case) + print(scores) diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index 945b8671..ac717d92 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -7,7 +7,7 @@ import os from typing import Dict, Optional, Union -from langchain_community.llms import HuggingFaceEndpoint +from langchain_huggingface import HuggingFaceEndpoint from langchain_core.embeddings import Embeddings from langchain_core.language_models import BaseLanguageModel @@ -82,10 +82,15 @@ def measure(self, test_case: Dict): print("OPENAI_API_KEY is provided, ragas initializes the model by OpenAI.") self.model = None if isinstance(self.model, str): + print('LLM endpoint: ', self.model) chat_model = HuggingFaceEndpoint( endpoint_url=self.model, - timeout=600, + task="text-generation", + max_new_tokens=1024, + do_sample=False, ) + print('Validating LLM endpoint....') + chat_model.invoke("Hello!") else: chat_model = self.model # Create a dataset from the test case @@ -128,7 +133,7 @@ def measure(self, test_case: Dict): llm=chat_model, embeddings=self.embeddings, ) - print(self.score) + # print(self.score) return self.score def is_successful(self): From 7b9b9b204686e68ab2cb09532ec85bf7560b46c6 Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Tue, 27 Aug 2024 10:46:32 -0700 Subject: [PATCH 09/18] allow per sample grading Signed-off-by: minmin-intel --- .../crag_eval/run_benchmark/grade_answers.py | 30 +++++++----- .../docker-compose-llm-judge-gaudi.yaml | 2 +- .../llm_judge/launch_llm_judge_endpoint.sh | 2 +- .../llm_judge/test_llm_endpoint.py | 15 ++++++ evals/metrics/ragas/ragas.py | 46 ++++++++----------- 5 files changed, 56 insertions(+), 39 deletions(-) create mode 100644 evals/evaluation/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py diff --git a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py index a3d08322..1741354c 100644 --- a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py +++ b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py @@ -20,7 +20,7 @@ def convert_data_format_for_ragas(data): def make_list_of_test_cases(data): # data: pandas dataframe # columns: ['query', 'answer', 'ref_answer'] - # return: a dict with keys: 'input', 'actual_output', 'expected_output' + # return: a list of dicts with keys: 'input', 'actual_output', 'expected_output' output = [] for _, row in data.iterrows(): output.append( @@ -43,14 +43,18 @@ def grade_answers(args, test_case): model= args.llm_endpoint, embeddings=embeddings) print('==============start grading==============') - scores = [] - for case in test_case: - metric.measure(case) - scores.append(metric.score) 
- print(metric.score) - print('-'*50) - # metric.measure(test_case) - return scores + + if args.batch_grade: + metric.measure(test_case) + return metric.score + else: + scores = [] + for case in test_case: + metric.measure(case) + scores.append(metric.score) + print(metric.score) + print('-'*50) + return scores if __name__ == '__main__': parser = argparse.ArgumentParser() @@ -58,13 +62,17 @@ def grade_answers(args, test_case): parser.add_argument("--llm_endpoint", type=str, default="http://localhost:8008") parser.add_argument("--filedir", type=str, help="Path to the file containing the data") parser.add_argument("--filename", type=str, help="Name of the file containing the data") + parser.add_argument("--batch_grade", action="store_true", help="Grade the answers in batch and get an aggregated score for the entire dataset") args = parser.parse_args() data = pd.read_csv(os.path.join(args.filedir, args.filename)) data = data.head(2) print(data) - # test_case = convert_data_format_for_ragas(data) - test_case = make_list_of_test_cases(data) + if args.batch_grade: + test_case = convert_data_format_for_ragas(data) + else: + test_case = make_list_of_test_cases(data) + print(test_case) scores = grade_answers(args, test_case) print(scores) diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml index 3bed748f..56b1e6dd 100644 --- a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml @@ -20,4 +20,4 @@ services: cap_add: - SYS_NICE ipc: host - command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 --sharded true --num-shard 4 \ No newline at end of file + command: --model-id ${LLM_MODEL_ID} --max-input-length 4096 --max-total-tokens 8192 --sharded true --num-shard 4 \ No newline at end of file diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh b/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh index 56ce8ab9..5b13d01d 100644 --- a/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh @@ -1,4 +1,4 @@ -export LLM_MODEL_ID="meta-llama/Meta-Llama-3-70B-Instruct" #"meta-llama/Meta-Llama-3-70B-Instruct" # +export LLM_MODEL_ID="meta-llama/Meta-Llama-3-70B-Instruct" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export HF_CACHE_DIR=${HF_CACHE_DIR} docker compose -f docker-compose-llm-judge-gaudi.yaml up -d \ No newline at end of file diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py b/evals/evaluation/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py new file mode 100644 index 00000000..ce961173 --- /dev/null +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py @@ -0,0 +1,15 @@ +from langchain_huggingface import HuggingFaceEndpoint +import os + +host_ip=os.environ.get("host_ip", "localhost") +url = "http://{host_ip}:8085".format(host_ip=host_ip) +print(url) + +model = HuggingFaceEndpoint( + endpoint_url=url, + task="text-generation", + max_new_tokens=10, + do_sample=False, +) + +print(model.invoke("what is deep learing?")) diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index ac717d92..d8e218dc 100644 --- a/evals/metrics/ragas/ragas.py +++ 
b/evals/metrics/ragas/ragas.py @@ -54,28 +54,11 @@ def __init__( self.metrics = metrics self.validated_list = ["answer_relevancy", "faithfulness", "answer_correctness"] - async def a_measure(self, test_case: Dict): - return self.measure(test_case) - - def measure(self, test_case: Dict): - - # sends to server try: - from ragas import evaluate from ragas.metrics import answer_relevancy, faithfulness, answer_correctness - except ModuleNotFoundError: raise ModuleNotFoundError("Please install ragas to use this metric. `pip install ragas`.") - try: - from datasets import Dataset - except ModuleNotFoundError: - raise ModuleNotFoundError("Please install dataset") - self.metrics_instance = { - "answer_relevancy": answer_relevancy, - "faithfulness": faithfulness, - } - # Set LLM model openai_key = os.getenv("OPENAI_API_KEY", None) if openai_key is not None: @@ -83,18 +66,16 @@ def measure(self, test_case: Dict): self.model = None if isinstance(self.model, str): print('LLM endpoint: ', self.model) - chat_model = HuggingFaceEndpoint( + self.chat_model = HuggingFaceEndpoint( endpoint_url=self.model, task="text-generation", max_new_tokens=1024, do_sample=False, ) - print('Validating LLM endpoint....') - chat_model.invoke("Hello!") else: - chat_model = self.model - # Create a dataset from the test case - # Convert the Dict to a format compatible with Dataset + self.chat_model = self.model + + # initialize metrics if self.metrics is not None: tmp_metrics = [] # check supported list @@ -113,12 +94,26 @@ def measure(self, test_case: Dict): raise ValueError("answer_relevancy metric need provide embeddings model.") tmp_metrics.append(get_metric(metric)) self.metrics = tmp_metrics - else: + else: # default metrics self.metrics = [ answer_relevancy, faithfulness, + answer_correctness, ] + + async def a_measure(self, test_case: Dict): + return self.measure(test_case) + + def measure(self, test_case: Dict): + from ragas import evaluate + try: + from datasets import Dataset + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install dataset") + + # Create a dataset from the test case + # Convert the Dict to a format compatible with Dataset data = { "question": test_case["input"], "contexts": test_case["retrieval_context"], @@ -130,10 +125,9 @@ def measure(self, test_case: Dict): self.score = evaluate( dataset, metrics=self.metrics, - llm=chat_model, + llm=self.chat_model, embeddings=self.embeddings, ) - # print(self.score) return self.score def is_successful(self): From 5c58b729065a9f60e810c7bf80d218c4b84af1bd Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Tue, 27 Aug 2024 10:56:18 -0700 Subject: [PATCH 10/18] save graded scores Signed-off-by: minmin-intel --- .../crag_eval/run_benchmark/grade_answers.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py index 1741354c..3aaaba5c 100644 --- a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py +++ b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py @@ -46,12 +46,12 @@ def grade_answers(args, test_case): if args.batch_grade: metric.measure(test_case) - return metric.score + return metric.score['answer_correctness'] else: scores = [] for case in test_case: metric.measure(case) - scores.append(metric.score) + scores.append(metric.score['answer_correctness']) print(metric.score) print('-'*50) return scores @@ -66,7 +66,7 @@ def grade_answers(args, test_case): args = parser.parse_args() 
data = pd.read_csv(os.path.join(args.filedir, args.filename)) - data = data.head(2) + # data = data.head(2) print(data) if args.batch_grade: test_case = convert_data_format_for_ragas(data) @@ -74,6 +74,17 @@ def grade_answers(args, test_case): test_case = make_list_of_test_cases(data) print(test_case) + scores = grade_answers(args, test_case) - print(scores) + + # save the scores + if args.batch_grade: + print("Aggregated answer correctness score: ", scores) + else: + data['answer_correctness'] = scores + output_file = args.filename.split('.')[0] + '_graded.csv' + data.to_csv(os.path.join(args.filedir, output_file), index=False) + print("Scores saved to ", os.path.join(args.filedir, args.output)) + + From d043dded61f9b89d59d96bb774928e5917be711c Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Tue, 27 Aug 2024 11:30:27 -0700 Subject: [PATCH 11/18] update readme Signed-off-by: minmin-intel --- evals/evaluation/crag_eval/README.md | 24 ++++++++-------- .../preprocess_data/run_data_preprocess.sh | 2 +- .../preprocess_data/run_sample_data.sh | 2 +- .../crag_eval/run_benchmark/grade_answers.py | 8 +++--- .../run_benchmark/run_generate_answer.sh | 8 +++--- .../crag_eval/run_benchmark/run_grading.sh | 2 +- .../run_benchmark/sample_answers.csv | 28 ------------------- 7 files changed, 24 insertions(+), 50 deletions(-) delete mode 100644 evals/evaluation/crag_eval/run_benchmark/sample_answers.csv diff --git a/evals/evaluation/crag_eval/README.md b/evals/evaluation/crag_eval/README.md index aa7097be..a277c136 100644 --- a/evals/evaluation/crag_eval/README.md +++ b/evals/evaluation/crag_eval/README.md @@ -50,10 +50,7 @@ The CRAG dataset has more than 4000 queries, and running all of them can be very ``` bash run_sample_data.sh ``` -4. Use the small subset that we have processed for a quick run -``` -Small data files in this repo -``` + ## Launch agent QnA system Here we showcase a RAG agent in GenAIExample repo. Please refer to the README in the AgentQnA example for more details. You can build your own agent systems using OPEA components, then expose your own systems as an endpoint for this benchmark. 1. Build images ``` git clone cd GenAIExamples/AgentQnA/tests/ bash 1_build_images.sh ``` 2. Start retrieval tool ``` bash 2_start_retrieval_tool.sh ``` 3. Ingest data into vector database and validate retrieval tool ``` # Placeholder - may change depending on data bash 3_ingest_data_and_validate_retrieval.sh ``` 3. Launch and validate agent endpoint ``` bash 4_launch_and_validate_agent.sh ``` ## Run CRAG benchmark -Once you have your agent system up and running, you can follow the steps below to run the benchmark. -1. Generate answers with agent -Change the variables in the script below and run the script. By default, it will run a sampled set of queries in music domain. +Once you have your agent system up and running, the next step is to generate answers with the agent. Change the variables in the script below and run the script. By default, it will run a sampled set of queries in the music domain. ``` cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark bash run_generate_answer.sh ``` -2. Use LLM-as-judge to grade the answers -First, in another terminal, launch llm endpoint with HF TGI + +## Use LLM-as-judge to grade the answers +1. Launch the LLM judge endpoint with HF TGI: in another terminal, run the command below. By default, `meta-llama/Meta-Llama-3-70B-Instruct` is used as the LLM judge. ``` cd llm_judge bash launch_llm_judge_endpoint.sh ``` -Validate that the llm endpoint is working properly. +2. Validate that the LLM endpoint is working properly. 
``` export host_ip=$(hostname -I | awk '{print $1}') curl ${host_ip}:8085/generate_stream \ -X POST \ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \ -H 'Content-Type: application/json' ``` -Second, back to the interactive crag-eval docker, run command below +Then go back to the interactive crag-eval docker container and run the command below. +``` +cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark/llm_judge/ +python3 test_llm_endpoint.py +``` +3. Grade the answer correctness using the LLM judge. We use the `answer_correctness` metric from [ragas](https://github.com/explodinggradients/ragas/blob/main/src/ragas/metrics/_answer_correctness.py). ``` +cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark/ bash run_grading.sh ``` diff --git a/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh b/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh index 8ecfa94f..fc6f19be 100644 --- a/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh +++ b/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh @@ -2,4 +2,4 @@ FILEDIR=$WORKDIR/datasets/crag_task_3_dev_v4 DOCOUT=$WORKDIR/datasets/crag_docs QAOUT=$WORKDIR/datasets/crag_qas -python process_data.py --data_dir $FILEDIR --docout $DOCOUT --qaout $QAOUT \ No newline at end of file +python3 process_data.py --data_dir $FILEDIR --docout $DOCOUT --qaout $QAOUT \ No newline at end of file diff --git a/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh b/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh index f0bce763..06514702 100644 --- a/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh +++ b/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh @@ -1,3 +1,3 @@ FILEDIR=$WORKDIR/datasets/crag_docs -python sample_data.py --filedir $FILEDIR \ No newline at end of file +python3 sample_data.py --filedir $FILEDIR \ No newline at end of file diff --git a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py index 3aaaba5c..ddeecdb6 100644 --- a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py +++ b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py @@ -66,14 +66,13 @@ def grade_answers(args, test_case): args = parser.parse_args() data = pd.read_csv(os.path.join(args.filedir, args.filename)) - # data = data.head(2) - print(data) + if args.batch_grade: test_case = convert_data_format_for_ragas(data) else: test_case = make_list_of_test_cases(data) - print(test_case) + # print(test_case) scores = grade_answers(args, test_case) @@ -82,9 +81,10 @@ def grade_answers(args, test_case): print("Aggregated answer correctness score: ", scores) else: data['answer_correctness'] = scores + print("Average answer correctness score: ", data['answer_correctness'].mean()) output_file = args.filename.split('.')[0] + '_graded.csv' data.to_csv(os.path.join(args.filedir, output_file), index=False) - print("Scores saved to ", os.path.join(args.filedir, args.output)) + print("Scores saved to ", os.path.join(args.filedir, output_file)) diff --git a/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh b/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh index 24b477ee..d4529fdb 100644 --- a/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh +++ b/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh @@ -1,13 +1,13 @@ -host_ip=$(hostname -I | awk '{print $1}') # change this to the IP of 
the service -port=9095 # change this to the port of the service -endpoint=${port}/v1/chat/completions # change this to the endpoint of the service +host_ip=$(hostname -I | awk '{print $1}') # change this to the IP of the agent +port=9095 # change this to the port of the agent +endpoint=${port}/v1/chat/completions # change this to the endpoint of the agent URL="http://${host_ip}:${endpoint}" echo "AGENT ENDPOINT URL: ${URL}" QUERYFILE=$WORKDIR/datasets/crag_qas/crag_qa_music_sampled.jsonl OUTPUTFILE=$WORKDIR/datasets/crag_results/results.jsonl -python generate_answers.py \ +python3 generate_answers.py \ --endpoint_url ${URL} \ --query_file $QUERYFILE \ --output_file $OUTPUTFILE diff --git a/evals/evaluation/crag_eval/run_benchmark/run_grading.sh b/evals/evaluation/crag_eval/run_benchmark/run_grading.sh index ff5a22f7..82cae435 100644 --- a/evals/evaluation/crag_eval/run_benchmark/run_grading.sh +++ b/evals/evaluation/crag_eval/run_benchmark/run_grading.sh @@ -1,5 +1,5 @@ FILEDIR=$WORKDIR/datasets/crag_results/ -FILENAME=crag_20queries_react_docgradertool_top5apis_v2sysm_gpt4omini.csv +FILENAME=results.csv LLM_ENDPOINT=http://${host_ip}:8085 python3 grade_answers.py \ diff --git a/evals/evaluation/crag_eval/run_benchmark/sample_answers.csv b/evals/evaluation/crag_eval/run_benchmark/sample_answers.csv deleted file mode 100644 index 790f6f6d..00000000 --- a/evals/evaluation/crag_eval/run_benchmark/sample_answers.csv +++ /dev/null @@ -1,28 +0,0 @@ -query,query_time,ref_answer,answer,context,num_llm_calls,total_tokens,question_type,static_or_dynamic,selected_tools -how many reading and leeds festivals has the band foo fighters headlined?,"03/13/2024, 09:55:12 PT",foo fighters have headlined a total of 4 reading and leeds festivals.,"The Foo Fighters have headlined the Reading and Leeds Festivals four times: in 2002, 2005, 2012, and 2019.","[[{'id': 'call_05BuB1KtO8uuJypQh6lecBVM', 'function': {'arguments': '{""query"":""Foo Fighters Reading and Leeds Festival headlining history""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'The Foo Fighters have a notable history of headlining the Reading and Leeds Festivals. They first headlined the festival in 2002 and returned to headline again in 2005. After a significant gap, they headlined once more in 2012. In 2019, they were announced as one of the headliners again, marking their return to the festival after several years. Overall, they have been a prominent presence at the festival since their first appearance in 1995.', 'The Foo Fighters have headlined the Reading and Leeds Festivals four times: in 2002, 2005, 2012, and 2019.']",3,959,aggregation,slow-changing,"['get_members', 'get_song_release_date', 'get_song_author', 'get_grammy_award_count_by_song', 'get_grammy_award_count_by_artist']" -how many songs has the band the beatles released that have been recorded at abbey road studios?,"03/13/2024, 09:54:32 PT",the beatles have released 190 songs that have been recorded at abbey road studios.,"The Beatles recorded most of their music at Abbey Road Studios, but the exact number of songs they released that were recorded there is not specified in the available information. 
However, it is known that they recorded a significant portion of their catalog at this studio from 1957 to 1970.","[[{'id': 'call_wthw0YH3cxU3yzYMdhfyY2OI', 'function': {'arguments': '{""query"":""The Beatles songs recorded at Abbey Road Studios""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'The Beatles recorded most of their music at Abbey Road Studios, formerly known as EMI Recording Studios, from their inception in 1957 until their break-up in 1970. This includes iconic albums such as ""Abbey Road,"" ""Sgt. Pepper\'s Lonely Hearts Club Band,"" and ""The White Album."" The studio is renowned for its cultural and historical significance in the music industry, particularly for the Beatles\' extensive catalog.', 'The Beatles recorded most of their music at Abbey Road Studios, but the exact number of songs they released that were recorded there is not specified in the available information. However, it is known that they recorded a significant portion of their catalog at this studio from 1957 to 1970.']",3,968,aggregation,static,"['get_grammy_award_count_by_song', 'get_song_release_date', 'get_song_release_country', 'get_members', 'search_song_entity_by_name']" -did the song the globe come out before the wedding?,"03/21/2024, 23:32:29 PT",yes,"The song ""The Globe"" was released in 1991, while ""The Wedding"" was released on September 23, 2016. Therefore, ""The Globe"" came out before ""The Wedding.""","[[{'id': 'call_7wKEamOXFKVmki6gklVACm0M', 'function': {'arguments': '{""song_name"": ""The Globe""}', 'name': 'get_song_release_date'}, 'type': 'function'}, {'id': 'call_rfOh7EoCSkyDRGPe7DvgKMFE', 'function': {'arguments': '{""song_name"": ""The Wedding""}', 'name': 'get_song_release_date'}, 'type': 'function'}], '{""result"": ""1991""}', '{""result"": ""2016-09-23""}', 'The song ""The Globe"" was released in 1991, while ""The Wedding"" was released on September 23, 2016. Therefore, ""The Globe"" came out before ""The Wedding.""']",2,839,comparison,static,"['get_song_release_country', 'search_song_entity_by_name', 'get_song_release_date', 'get_grammy_best_song_by_year', 'get_song_author']" -"who has had more number one hits on the us billboard hot 100 chart, michael jackson or elvis presley?","03/13/2024, 09:43:49 PT","elvis presley has had more number one hits on the us billboard hot 100 chart than michael jackson, with 18 number one hits compared to jackson's 13.","Elvis Presley had more number one hits on the US Billboard Hot 100 chart, with a total of 18 hits, compared to Michael Jackson's 13 number one hits.","[[{'id': 'call_i5H8SAsGgRTXnvI0sDm5wmrv', 'function': {'arguments': '{""query"": ""Michael Jackson number one hits on US Billboard Hot 100""}', 'name': 'search_knowledge_base'}, 'type': 'function'}, {'id': 'call_Mgec4kbeE0z4h3QhJ0ENNTHU', 'function': {'arguments': '{""query"": ""Elvis Presley number one hits on US Billboard Hot 100""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Michael Jackson had 13 number-one hits on the US Billboard Hot 100.', 'Elvis Presley had 18 number one hits on the US Billboard Hot 100 chart. 
His number one singles spent a total of 81 weeks at the top position, which is the most of any artist.', ""Elvis Presley had more number one hits on the US Billboard Hot 100 chart, with a total of 18 hits, compared to Michael Jackson's 13 number one hits.""]",4,1136,comparison,static,"['get_grammy_award_count_by_song', 'get_grammy_award_count_by_artist', 'get_song_author', 'search_song_entity_by_name', 'get_song_release_country']" -when was alex van halen the drummer for the band santana?,"03/13/2024, 10:04:31 PT",invalid question,Invalid question.,['Invalid question.'],1,343,false_premise,static,"['get_members', 'get_song_author', 'get_song_release_date', 'get_grammy_award_date_by_artist', 'get_grammy_award_count_by_song']" -"when did dolly parton's song, blown away, come out?","03/10/2024, 23:28:06 PT",invalid question,"Carrie Underwood's song ""Blown Away"" was released on May 1, 2012.","[[{'id': 'call_7ZIr0njVYoqlfvoCdd6pLmHb', 'function': {'arguments': '{""song_name"":""Blown Away""}', 'name': 'get_song_release_country'}, 'type': 'function'}], '{""result"": ""ID""}', [{'id': 'call_ZUYqfFFb4VRGVHwhrG3nsV8L', 'function': {'arguments': '{""song_name"":""Blown Away""}', 'name': 'get_song_author'}, 'type': 'function'}], '{""result"": ""Carrie Underwood""}', [{'id': 'call_mA2bAMtysJClId77OoUan9yO', 'function': {'arguments': '{""query"":""Blown Away Carrie Underwood release date""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Carrie Underwood\'s album ""Blown Away"" was released on May 1, 2012.', 'Carrie Underwood\'s song ""Blown Away"" was released on May 1, 2012.']",5,1798,false_premise,static,"['get_grammy_award_count_by_song', 'get_song_author', 'get_song_release_country', 'get_grammy_best_song_by_year', 'get_grammy_award_date_by_artist']" -what grammy award did edgar barrera win this year?,"03/05/2024, 23:25:22 PT",invalid question,"Edgar Barrera won a Grammy award in 2014, but there is no record of him winning a Grammy award in 2024.","[[{'id': 'call_ptgqp9zavnUvzSqmBNPNa8R3', 'function': {'arguments': '{""artist_name"":""Edgar Barrera""}', 'name': 'get_grammy_award_date_by_artist'}, 'type': 'function'}], '{""result"": [2014]}', 'Edgar Barrera won a Grammy award in 2014, but there is no record of him winning a Grammy award in 2024.']",2,828,false_premise,static,"['get_grammy_award_date_by_artist', 'get_grammy_best_artist_by_year', 'get_grammy_best_album_by_year', 'get_grammy_award_count_by_artist', 'get_grammy_best_song_by_year']" -when did miley cyrus win grammy best new artist award?,"03/21/2024, 23:38:14 PT",invalid question,Miley Cyrus has not won the Grammy Best New Artist award.,"[[{'id': 'call_wCWNt9LOduR5IESTc1jeB6n4', 'function': {'arguments': '{""artist_name"":""Miley Cyrus""}', 'name': 'get_grammy_award_date_by_artist'}, 'type': 'function'}], '{""result"": []}', 'Miley Cyrus has not won the Grammy Best New Artist award.']",2,790,false_premise,static,"['get_all_awarded_artists', 'get_grammy_best_artist_by_year', 'get_grammy_award_date_by_artist', 'get_grammy_award_count_by_artist', 'get_grammy_award_count_by_song']" -when was stevie wonder the keyboardist for the band the rolling stones?,"03/13/2024, 10:01:54 PT",invalid question,Invalid question.,['Invalid question.'],1,355,false_premise,static,"['get_grammy_award_date_by_artist', 'get_artist_birth_date', 'get_grammy_award_count_by_artist', 'get_grammy_award_count_by_song', 'get_artist_birth_place']" -who is nicki minaj's latest diss track about?,"03/10/2024, 23:32:04 PT","nicki minaj's song ""big foot"" is a 
diss track about megan thee stallion.","Nicki Minaj's latest track, titled ""Big Foot,"" released in early 2024, is widely speculated to be a response to Megan Thee Stallion's song ""Hiss."" While many consider it a diss track, Nicki has denied that it is intended as such, stating, ""Who said 'diss track'??????"" The lyrics seem to take shots at Megan, but Nicki has pushed back against the diss track label.","[[{'id': 'call_trQnbJbaT3p5iPBASlNNFK0s', 'function': {'arguments': '{""query"":""Nicki Minaj latest diss track 2024""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Nicki Minaj\'s latest track, titled ""Big Foot,"" has been released in early 2024 and is widely speculated to be a response to Megan Thee Stallion\'s song ""Hiss."" While many are labeling it a diss track, Nicki herself has denied that it is intended as such, stating, ""Who said \'diss track\'??????"" The song features lyrics that appear to take shots at Megan, but Nicki has pushed back against the diss track label.', 'Nicki Minaj\'s latest track, titled ""Big Foot,"" released in early 2024, is widely speculated to be a response to Megan Thee Stallion\'s song ""Hiss."" While many consider it a diss track, Nicki has denied that it is intended as such, stating, ""Who said \'diss track\'??????"" The lyrics seem to take shots at Megan, but Nicki has pushed back against the diss track label.']",3,1001,multi-hop,fast-changing,"['get_song_author', 'get_all_awarded_artists', 'get_song_release_date', 'get_grammy_best_artist_by_year', 'search_song_entity_by_name']" -what's the most recent album from the founder of ysl records?,"03/05/2024, 23:28:07 PT",business is business,"Sorry, need more steps to process this request.","[[{'id': 'call_2yMQLyoFwYCt2KTxpiQD9I4k', 'function': {'arguments': '{""artist_name"":""Young Stoner Life Records""}', 'name': 'search_artist_entity_by_name'}, 'type': 'function'}], '{""result"": [""Young Stoner Life"", ""Young Life"", ""Stoner"", ""Re*Cords"", ""Alyson Stoner"", ""Art Stoner"", ""Billy Stoner"", ""Brian Stoner"", ""Colin Stoner"", ""David Stoner""]}', [{'id': 'call_dBz3tCbgZz7C9jSKczWKUP5T', 'function': {'arguments': '{""artist_name"":""Young Thug""}', 'name': 'search_artist_entity_by_name'}, 'type': 'function'}], '{""result"": [""Young Thug"", ""Young T.H.U.G."", ""Thug"", ""T.H.U.G"", ""T.H.U.G."", ""Arden Thug"", ""Babs Thug"", ""Bambie Thug"", ""blond thug"", ""Cali Thug""]}', [{'id': 'call_GbuvwSrKem9ZbExvw6gf1U8I', 'function': {'arguments': '{""artist_name"":""Young Thug""}', 'name': 'get_artist_all_works'}, 'type': 'function'}], '{""result"": [""All in a Day"", ""1017 Thug"", ""Black Portland"", ""Thuga Thuga"", ""1017 Thug 2"", ""1017 Thug 3: The Finale"", ""I Came From Nothing 3"", ""I Came From Nothing 2"", ""Barter 6"", ""Slime Season 2"", ""Slime Season 3"", ""I Came From Nothing"", ""Gangster Shit"", ""JEFFERY"", ""Old English"", ""Gang Up"", ""Beautiful Thugger Girls"", ""Young Martha"", ""Danny Glover"", ""Slime Language"", ""On the Rvn!!"", ""The London"", ""Thugga Thugga"", ""Metro Thuggin"", ""Hot (remix)"", ""Slime Season 4"", ""Cheat Code Mode"", ""Trap Zuse"", ""2 Bitches"", ""Young Rich Homie Thuggin"", ""Tell Her Nothing"", ""Stoner (remixes)"", ""Liger"", ""Brick Sqaud"", ""I Came from Nothing 2"", ""A Nightmare From Beyond"", ""Street Is Watching (The Mix-Tape)"", ""BUSINESS IS BUSINESS"", ""From a Man"", ""From A Man""]}', [{'id': 'call_NQ22bfeZLvSY61NxL2YP1YBb', 'function': {'arguments': '{""artist_name"":""Young Thug""}', 'name': 'get_artist_all_works'}, 
'type': 'function'}], '{""result"": [""All in a Day"", ""1017 Thug"", ""Black Portland"", ""Thuga Thuga"", ""1017 Thug 2"", ""1017 Thug 3: The Finale"", ""I Came From Nothing 3"", ""I Came From Nothing 2"", ""Barter 6"", ""Slime Season 2"", ""Slime Season 3"", ""I Came From Nothing"", ""Gangster Shit"", ""JEFFERY"", ""Old English"", ""Gang Up"", ""Beautiful Thugger Girls"", ""Young Martha"", ""Danny Glover"", ""Slime Language"", ""On the Rvn!!"", ""The London"", ""Thugga Thugga"", ""Metro Thuggin"", ""Hot (remix)"", ""Slime Season 4"", ""Cheat Code Mode"", ""Trap Zuse"", ""2 Bitches"", ""Young Rich Homie Thuggin"", ""Tell Her Nothing"", ""Stoner (remixes)"", ""Liger"", ""Brick Sqaud"", ""I Came from Nothing 2"", ""A Nightmare From Beyond"", ""Street Is Watching (The Mix-Tape)"", ""BUSINESS IS BUSINESS"", ""From a Man"", ""From A Man""]}', 'Sorry, need more steps to process this request.']",5,500,multi-hop,slow-changing,"['get_song_author', 'search_artist_entity_by_name', 'get_grammy_award_count_by_artist', 'get_all_awarded_artists', 'get_artist_all_works']" -what's the most recent album from the puerto rican artist that's been in wwe?,"03/10/2024, 23:31:52 PT",nadie sabe lo que va a pasar mañana,"The most recent album from the Puerto Rican artist Bad Bunny, who has been in WWE, is ""Nadie sabe lo que va a pasar mañana,"" released on October 13, 2023.","[[{'id': 'call_b8OrYeD5gSQjBOnUjJvtg6Y8', 'function': {'arguments': '{""artist_name"":""Bad Bunny""}', 'name': 'search_artist_entity_by_name'}, 'type': 'function'}], '{""result"": [""Bad Bunny"", ""Bunny"", ""All Bad Bunny Everything"", ""bAd bAd"", ""The Bad Bad Bad"", ""Bad"", ""B.A.D."", ""BAD!"", ""BAD"", ""Bad Bad Bird""]}', [{'id': 'call_jKkRPtMcqc8Pp9127rxo3jV9', 'function': {'arguments': '{""artist_name"":""Bad Bunny""}', 'name': 'get_grammy_award_count_by_artist'}, 'type': 'function'}], '{""result"": 0}', [{'id': 'call_7PySlZykLTc7aHyLSi5EdMmJ', 'function': {'arguments': '{""artist_name"":""Bad Bunny""}', 'name': 'get_grammy_award_date_by_artist'}, 'type': 'function'}], '{""result"": []}', [{'id': 'call_21vZE6bAa9CfdWONNgwEYBAU', 'function': {'arguments': '{""query"":""Bad Bunny discography""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Bad Bunny\'s discography includes several studio albums, with his latest being ""Nadie sabe lo que va a pasar mañana,"" released on October 13, 2023. This album features 22 tracks and includes collaborations with artists such as Young Miko, Feid, Arcángel, Bryant Myers, and Eladio Carrión. His previous albums include ""Un verano sin ti"" and others that have contributed to his recognition as a leading artist in the Latin music scene.', 'The most recent album from the Puerto Rican artist Bad Bunny, who has been in WWE, is ""Nadie sabe lo que va a pasar mañana,"" released on October 13, 2023.']",6,2614,multi-hop,slow-changing,"['get_all_awarded_artists', 'get_grammy_award_count_by_artist', 'get_grammy_award_date_by_artist', 'search_artist_entity_by_name', 'get_grammy_best_artist_by_year']" -who has played drums for the red hot chili peppers?,"03/17/2024, 17:08:10 PT","chad smith, jack irons, and cliff martinez","The drummers who have played for the Red Hot Chili Peppers include: - -1. Chad Smith -2. Jack Irons -3. Cliff Martinez -4. D.H. 
Peligro - -These are some of the notable drummers associated with the band.","[[{'id': 'call_O5mU9etmvrZ2rpqe38tvglh8', 'function': {'arguments': '{""band_name"":""Red Hot Chili Peppers""}', 'name': 'get_members'}, 'type': 'function'}], '{""result"": [""John Frusciante"", ""Flea"", ""D.H. Peligro"", ""DeWayne \\""Blackbyrd\\"" McKnight"", ""Arik Marshall"", ""Josh Klinghoffer"", ""Chad Smith"", ""Dave Navarro"", ""Hillel Slovak"", ""Jack Irons"", ""Jack Sherman"", ""Cliff Martinez"", ""Anthony Kiedis"", ""Jesse Tobias""]}', 'The drummers who have played for the Red Hot Chili Peppers include:\n\n1. Chad Smith\n2. Jack Irons\n3. Cliff Martinez\n4. D.H. Peligro\n\nThese are some of the notable drummers associated with the band.']",2,858,set,static,"['get_members', 'get_song_author', 'get_artist_birth_place', 'get_grammy_award_count_by_artist', 'get_grammy_award_count_by_song']" -how many tracks are in drake's last album?,"03/12/2024, 12:28:41 PT",23,"Drake's last album, ""For All the Dogs,"" has a total of 23 tracks.","[[{'id': 'call_nXv914FUcVGCYeyW2MdmigSz', 'function': {'arguments': '{""query"":""Drake last album track count""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Drake\'s last album, ""For All the Dogs,"" has a track count of 23.', 'Drake\'s last album, ""For All the Dogs,"" has a total of 23 tracks.']",3,852,simple,slow-changing,"['get_grammy_award_count_by_song', 'get_grammy_award_count_by_artist', 'get_all_awarded_artists', 'get_song_release_date', 'get_song_release_country']" -how many number one hits did taylor swift have?,"03/12/2024, 12:28:35 PT",11,"As of February 24, 2024, Taylor Swift has 11 number one hits on the Billboard Hot 100.","[[{'id': 'call_AeMTBYLShMlu3PzfQztMCxns', 'function': {'arguments': '{""query"":""Taylor Swift number one hits""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'As of the Hot 100 dated February 24, 2024, Taylor Swift has 11 number one hits on the Billboard Hot 100.', 'As of February 24, 2024, Taylor Swift has 11 number one hits on the Billboard Hot 100.']",3,877,simple,slow-changing,"['get_grammy_award_count_by_song', 'get_grammy_award_count_by_artist', 'get_song_author', 'get_grammy_award_date_by_artist', 'search_song_entity_by_name']" -where is marem ladson's original home country?,"03/21/2024, 23:41:53 PT",es,Marem Ladson's original home country is Spain (ES).,"[[{'id': 'call_Zey99OPDixDXc3nUHZnD5MVC', 'function': {'arguments': '{""artist_name"":""Marem Ladson""}', 'name': 'get_artist_birth_place'}, 'type': 'function'}], '{""result"": ""ES""}', ""Marem Ladson's original home country is Spain (ES).""]",2,716,simple,static,"['get_artist_birth_place', 'get_song_release_country', 'get_song_author', 'get_artist_birth_date', 'search_song_entity_by_name']" -tell me the most recent song or album by doris duke?,"03/21/2024, 23:40:20 PT",i’m a loser: the swamp dogg sessions... 
and more,"The most recent album by Doris Duke is ""I'm A Loser,"" which was released on February 4, 2021.","[[{'id': 'call_IBn6alF1JcL2QNYxDvkd9Nla', 'function': {'arguments': '{""song_name"":""Doris Duke""}', 'name': 'search_song_entity_by_name'}, 'type': 'function'}], '{""result"": [""Passion Flower (For Doris Duke)"", ""Doris"", ""Black Doris"", ""Clara / Doris"", ""Doris 100"", ""Doris Day"", ""Doris Machin"", ""Doris & Me"", ""Doris Rising"", ""Doris Troy""]}', [{'id': 'call_rfPHKHEPZ1UntCcHBZBOeQtM', 'function': {'arguments': '{""query"":""Doris Duke most recent song or album""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Doris Duke\'s most recent album is ""I\'m A Loser,"" which was released on February 4, 2021.', 'The most recent album by Doris Duke is ""I\'m A Loser,"" which was released on February 4, 2021.']",4,1418,simple_w_condition,fast-changing,"['search_song_entity_by_name', 'get_grammy_award_count_by_song', 'get_song_release_date', 'get_song_author', 'get_grammy_award_count_by_artist']" -what song topped the billboard chart on 2004-02-04?,"03/21/2024, 23:34:49 PT",hey ya!,"The song that topped the Billboard chart on February 4, 2004, was ""Hey Ya!"" by OutKast.","[[{'id': 'call_nwInngToFLLUFMeKYuR5lwhx', 'function': {'arguments': '{""rank"":1,""date"":""2004-02-04""}', 'name': 'get_billboard_rank_date'}, 'type': 'function'}], '{""result"": [[""Hey Ya!""], [""OutKast""]]}', 'The song that topped the Billboard chart on February 4, 2004, was ""Hey Ya!"" by OutKast.']",2,794,simple_w_condition,fast-changing,"['get_billboard_attributes', 'get_song_release_date', 'search_song_entity_by_name', 'get_song_release_country', 'get_billboard_rank_date']" -what is the most popular song on billboard in 2024-02-28?,"03/21/2024, 23:34:42 PT",texas hold 'em,"The most popular song on Billboard on February 28, 2024, was ""Texas Hold 'Em"" by Beyonce.","[[{'id': 'call_ctgHbLk8zIctCE0qOgjtcTU7', 'function': {'arguments': '{""rank"":1,""date"":""2024-02-28""}', 'name': 'get_billboard_rank_date'}, 'type': 'function'}], '{""result"": [[""Texas Hold \'Em""], [""Beyonce""]]}', 'The most popular song on Billboard on February 28, 2024, was ""Texas Hold \'Em"" by Beyonce.']",2,798,simple_w_condition,fast-changing,"['get_song_release_date', 'get_billboard_attributes', 'get_song_release_country', 'search_song_entity_by_name', 'get_billboard_rank_date']" -"what album did maroon five release in 2010, which included the songs ""moves like jagger"" and ""misery""?","03/13/2024, 09:49:01 PT","maroon five released the album ""hands all over"" in 2010, which included the songs ""moves like jagger"" and ""misery"".","Sorry, need more steps to process this request.","[[{'id': 'call_TiKh63jRcIYXYAsXxrzUB0Ip', 'function': {'arguments': '{""song_name"":""moves like jagger""}', 'name': 'search_song_entity_by_name'}, 'type': 'function'}], '{""result"": [""Moves Like Jagger"", ""Moves Like Jagger (remix)"", ""Moves Like Jagger (Remixes)"", ""D.N.A. 
Moves Like Jagger Mashup"", ""Moves Like Jagger / Jumpin\' Jack Flash"", ""Moves Like Jagger (The Voice Performance)"", ""Move Like Jagger"", ""Foxy Jagger"", ""Jagger \'67"", ""Peter Jagger""]}', [{'id': 'call_KlfgNIKNzIjm9qAwpWXOkl5n', 'function': {'arguments': '{""song_name"":""misery""}', 'name': 'search_song_entity_by_name'}, 'type': 'function'}], '{""result"": [""Misery"", ""MISERY"", ""misery"", ""Misery..."", ""Artificial Misery"", ""Beautiful Misery"", ""Blissful Misery"", ""Burn // Misery"", ""Collected Misery"", ""Concret\\u00e9 Misery""]}', [{'id': 'call_NePkutCu9BVaeeMEV2xJNXhE', 'function': {'arguments': '{""song_name"":""moves like jagger""}', 'name': 'get_grammy_award_count_by_song'}, 'type': 'function'}], '{""result"": 0}', [{'id': 'call_0fGXDvgbkvbEHMnRNBdIfK7i', 'function': {'arguments': '{""song_name"":""misery""}', 'name': 'get_grammy_award_count_by_song'}, 'type': 'function'}], '{""result"": 0}', 'Sorry, need more steps to process this request.']",5,500,simple_w_condition,static,"['get_song_release_date', 'get_song_release_country', 'get_grammy_best_album_by_year', 'get_grammy_award_count_by_song', 'search_song_entity_by_name']" From 9556f9a189ad51e3fbfcafc7bcb551aceb3d812f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 27 Aug 2024 21:31:38 +0000 Subject: [PATCH 12/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- evals/evaluation/crag_eval/README.md | 1 - .../crag_eval/docker/build_image.sh | 5 +- .../crag_eval/docker/launch_eval_container.sh | 5 +- .../crag_eval/docker/requirements.txt | 10 +-- .../crag_eval/preprocess_data/process_data.py | 58 +++++++++------- .../preprocess_data/run_data_preprocess.sh | 5 +- .../preprocess_data/run_sample_data.sh | 5 +- .../crag_eval/preprocess_data/sample_data.py | 24 ++++--- .../run_benchmark/generate_answers.py | 55 ++++++++------- .../crag_eval/run_benchmark/grade_answers.py | 67 ++++++++++--------- .../docker-compose-llm-judge-gaudi.yaml | 5 +- .../llm_judge/docker-compose-llm-judge.yaml | 5 +- .../llm_judge/launch_llm_judge_endpoint.sh | 5 +- .../llm_judge/test_llm_endpoint.py | 8 ++- .../run_benchmark/run_generate_answer.sh | 3 + .../crag_eval/run_benchmark/run_grading.sh | 3 + evals/metrics/ragas/ragas.py | 30 +++++---- 17 files changed, 178 insertions(+), 116 deletions(-) diff --git a/evals/evaluation/crag_eval/README.md b/evals/evaluation/crag_eval/README.md index a277c136..1e5ce1c2 100644 --- a/evals/evaluation/crag_eval/README.md +++ b/evals/evaluation/crag_eval/README.md @@ -104,4 +104,3 @@ python3 test_llm_endpoint.py cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark/ bash run_grading.sh ``` - diff --git a/evals/evaluation/crag_eval/docker/build_image.sh b/evals/evaluation/crag_eval/docker/build_image.sh index 284762c7..a743900f 100644 --- a/evals/evaluation/crag_eval/docker/build_image.sh +++ b/evals/evaluation/crag_eval/docker/build_image.sh @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + dockerfile=Dockerfile docker build \ @@ -6,4 +9,4 @@ docker build \ --network=host \ --build-arg http_proxy=${http_proxy} \ --build-arg https_proxy=${https_proxy} \ - --build-arg no_proxy=${no_proxy} \ \ No newline at end of file + --build-arg no_proxy=${no_proxy} \ diff --git a/evals/evaluation/crag_eval/docker/launch_eval_container.sh b/evals/evaluation/crag_eval/docker/launch_eval_container.sh index 32b27f8e..8698f452 100644 --- 
a/evals/evaluation/crag_eval/docker/launch_eval_container.sh +++ b/evals/evaluation/crag_eval/docker/launch_eval_container.sh @@ -1,4 +1,7 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + volume=$WORKDIR host_ip=$(hostname -I | awk '{print $1}') -docker run -it -v $volume:/home/user/ -e WORKDIR=/home/user -e HF_HOME=/home/user/hf_cache -e host_ip=$host_ip -e http_proxy=$http_proxy -e https_proxy=$https_proxy crag-eval:latest \ No newline at end of file +docker run -it -v $volume:/home/user/ -e WORKDIR=/home/user -e HF_HOME=/home/user/hf_cache -e host_ip=$host_ip -e http_proxy=$http_proxy -e https_proxy=$https_proxy crag-eval:latest diff --git a/evals/evaluation/crag_eval/docker/requirements.txt b/evals/evaluation/crag_eval/docker/requirements.txt index 4579eca7..b32606b7 100644 --- a/evals/evaluation/crag_eval/docker/requirements.txt +++ b/evals/evaluation/crag_eval/docker/requirements.txt @@ -1,8 +1,8 @@ -pandas datasets -ragas -langchain-community -langchain-huggingface evaluate jieba -sentence_transformers \ No newline at end of file +langchain-community +langchain-huggingface +pandas +ragas +sentence_transformers diff --git a/evals/evaluation/crag_eval/preprocess_data/process_data.py b/evals/evaluation/crag_eval/preprocess_data/process_data.py index 6f0c6775..2f5dea00 100644 --- a/evals/evaluation/crag_eval/preprocess_data/process_data.py +++ b/evals/evaluation/crag_eval/preprocess_data/process_data.py @@ -1,10 +1,16 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse import json import os -import argparse + import tqdm + def split_text(text, chunk_size=2000, chunk_overlap=400): from langchain_text_splitters import RecursiveCharacterTextSplitter + text_splitter = RecursiveCharacterTextSplitter( # Set a really small chunk size, just to show. 
chunk_size=chunk_size, @@ -15,14 +21,16 @@ def split_text(text, chunk_size=2000, chunk_overlap=400): ) return text_splitter.split_text(text) + def process_html_string(text): from bs4 import BeautifulSoup + # print(text) soup = BeautifulSoup(text, features="html.parser") # kill all script and style elements for script in soup(["script", "style"]): - script.extract() # rip it out + script.extract() # rip it out # get text text_content = soup.get_text() @@ -32,21 +40,22 @@ def process_html_string(text): # break multi-headlines into a line each chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) # drop blank lines - final_text = '\n'.join(chunk for chunk in chunks if chunk) + final_text = "\n".join(chunk for chunk in chunks if chunk) # print(final_text) return final_text + def preprocess_data(input_file): snippet = [] return_data = [] n = 0 - with open(input_file, 'r') as f: + with open(input_file, "r") as f: for line in f: data = json.loads(line) # search results snippets --> retrieval corpus docs - docs = data['search_results'] - + docs = data["search_results"] + for doc in docs: # chunks = split_text(doc['page_snippet']) # for chunk in chunks: @@ -54,30 +63,27 @@ def preprocess_data(input_file): # "query": data['query'], # "domain": data['domain'], # "doc":chunk}) - snippet.append({ - "query": data['query'], - "domain": data['domain'], - "doc":doc['page_snippet']}) - print('-----------------------------------') - + snippet.append({"query": data["query"], "domain": data["domain"], "doc": doc["page_snippet"]}) + print("-----------------------------------") + # qa pairs without search results output = {} for k, v in data.items(): - if k != 'search_results': + if k != "search_results": output[k] = v return_data.append(output) return snippet, return_data - -if __name__ == '__main__': + +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--filedir', type=str, default=None) - parser.add_argument('--docout', type=str, default=None) - parser.add_argument('--qaout', type=str, default=None) + parser.add_argument("--filedir", type=str, default=None) + parser.add_argument("--docout", type=str, default=None) + parser.add_argument("--qaout", type=str, default=None) # parser.add_argument('--chunk_size', type=int, default=10000) # parser.add_argument('--chunk_overlap', type=int, default=0) - + args = parser.parse_args() if not os.path.exists(args.docout): @@ -95,17 +101,17 @@ def preprocess_data(input_file): doc, data = preprocess_data(file) docs.extend(doc) qa_pairs.extend(data) - + # group by domain domains = ["finance", "music", "movie", "sports", "open"] for domain in domains: - with open(os.path.join(args.docout, "crag_docs_"+domain + ".jsonl"), 'w') as f: + with open(os.path.join(args.docout, "crag_docs_" + domain + ".jsonl"), "w") as f: for doc in docs: - if doc['doc']!="" and doc['domain'] == domain: - f.write(json.dumps(doc) + '\n') + if doc["doc"] != "" and doc["domain"] == domain: + f.write(json.dumps(doc) + "\n") - with open(os.path.join(args.qaout, "crag_qa_"+domain + ".jsonl"), 'w') as f: + with open(os.path.join(args.qaout, "crag_qa_" + domain + ".jsonl"), "w") as f: for d in qa_pairs: - if d['domain'] == domain: - f.write(json.dumps(d) + '\n') \ No newline at end of file + if d["domain"] == domain: + f.write(json.dumps(d) + "\n") diff --git a/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh b/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh index fc6f19be..d93a5fa1 100644 --- 
a/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh +++ b/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh @@ -1,5 +1,8 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + FILEDIR=$WORKDIR/datasets/crag_task_3_dev_v4 DOCOUT=$WORKDIR/datasets/crag_docs QAOUT=$WORKDIR/datasets/crag_qas -python3 process_data.py --data_dir $FILEDIR --docout $DOCOUT --qaout $QAOUT \ No newline at end of file +python3 process_data.py --data_dir $FILEDIR --docout $DOCOUT --qaout $QAOUT diff --git a/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh b/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh index 06514702..e0e3b0c8 100644 --- a/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh +++ b/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + FILEDIR=$WORKDIR/datasets/crag_docs -python3 sample_data.py --filedir $FILEDIR \ No newline at end of file +python3 sample_data.py --filedir $FILEDIR diff --git a/evals/evaluation/crag_eval/preprocess_data/sample_data.py b/evals/evaluation/crag_eval/preprocess_data/sample_data.py index 1fcb4916..f4aa8209 100644 --- a/evals/evaluation/crag_eval/preprocess_data/sample_data.py +++ b/evals/evaluation/crag_eval/preprocess_data/sample_data.py @@ -1,26 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse import json -import pandas as pd import os -import argparse + +import pandas as pd import tqdm + def sample_data(input_file, output_file): df = pd.read_json(input_file, lines=True, convert_dates=False) # group by `question_type` and `static_or_dynamic` - df_grouped = df.groupby(['question_type', 'static_or_dynamic']) + df_grouped = df.groupby(["question_type", "static_or_dynamic"]) # sample 5 rows from each group if there are more than 5 rows else return all rows df_sampled = df_grouped.apply(lambda x: x.sample(5) if len(x) > 5 else x) # save sampled data to output file - df_sampled.to_json(output_file, orient='records', lines=True) + df_sampled.to_json(output_file, orient="records", lines=True) -if __name__ == '__main__': + +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--filedir', type=str, default=None) - + parser.add_argument("--filedir", type=str, default=None) + args = parser.parse_args() data_files = os.listdir(args.filedir) for file in tqdm.tqdm(data_files): file = os.path.join(args.filedir, file) - output_file = file.replace('.jsonl', '_sampled.jsonl') - sample_data(file, output_file) \ No newline at end of file + output_file = file.replace(".jsonl", "_sampled.jsonl") + sample_data(file, output_file) diff --git a/evals/evaluation/crag_eval/run_benchmark/generate_answers.py b/evals/evaluation/crag_eval/run_benchmark/generate_answers.py index 6c55da3c..bad9b768 100644 --- a/evals/evaluation/crag_eval/run_benchmark/generate_answers.py +++ b/evals/evaluation/crag_eval/run_benchmark/generate_answers.py @@ -1,42 +1,51 @@ -import requests -import json +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import argparse +import json import os + import pandas as pd +import requests + def get_test_data(args): - if args.query_file.endswith('.jsonl'): + if args.query_file.endswith(".jsonl"): df = pd.read_json(args.query_file, lines=True, convert_dates=False) - elif args.query_file.endswith('.csv'): + elif args.query_file.endswith(".csv"): df = 
pd.read_csv(args.query_file) return df + def generate_answer(url, prompt): proxies = {"http": ""} payload = { - "query":prompt, - } + "query": prompt, + } response = requests.post(url, json=payload, proxies=proxies) answer = response.json()["text"] return answer -def save_results(output_file, output_list): + +def save_results(output_file, output_list): with open(output_file, "w") as f: for output in output_list: f.write(json.dumps(output)) f.write("\n") + def save_as_csv(output): df = pd.read_json(output, lines=True, convert_dates=False) df.to_csv(output.replace(".jsonl", ".csv"), index=False) + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--endpoint_url", type=str, default=None, help="url of the agent QnA system endpoint") parser.add_argument("--query_file", type=str, default=None, help="query jsonl file") parser.add_argument("--output_file", type=str, default="output.jsonl", help="output jsonl file") args = parser.parse_args() - + url = args.endpoint_url df = get_test_data(args) @@ -45,27 +54,27 @@ def save_as_csv(output): output_list = [] n = 0 for _, row in df.iterrows(): - q = row['query'] - t = row['query_time'] + q = row["query"] + t = row["query_time"] prompt = "Question: {}\nThe question was asked at: {}".format(q, t) - print('******Query:\n',prompt) + print("******Query:\n", prompt) print("******Agent is working on the query") answer = generate_answer(url, prompt) - print('******Answer from agent:\n',answer) - print('='*50) + print("******Answer from agent:\n", answer) + print("=" * 50) output_list.append( - { - "query": q, - "query_time": t, - "ref_answer": row["answer"], - "answer": answer, - "question_type": row["question_type"], - "static_or_dynamic": row["static_or_dynamic"], - } - ) + { + "query": q, + "query_time": t, + "ref_answer": row["answer"], + "answer": answer, + "question_type": row["question_type"], + "static_or_dynamic": row["static_or_dynamic"], + } + ) save_results(args.output_file, output_list) # n += 1 # if n > 1: # break save_results(args.output_file, output_list) - save_as_csv(args.output_file) \ No newline at end of file + save_as_csv(args.output_file) diff --git a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py index ddeecdb6..094a87aa 100644 --- a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py +++ b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py @@ -1,18 +1,24 @@ -from evals.metrics.ragas import RagasMetric -from ragas.metrics import answer_correctness +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import argparse -import pandas as pd import os +import pandas as pd +from ragas.metrics import answer_correctness + +from evals.metrics.ragas import RagasMetric + + def convert_data_format_for_ragas(data): # data: pandas dataframe # columns: ['query', 'answer', 'ref_answer'] # return: a dict with keys: 'input', 'actual_output', 'expected_output' output = { - 'input': data['query'].tolist(), - 'actual_output': data['answer'].tolist(), - 'expected_output': data['ref_answer'].tolist(), - 'retrieval_context': [["dummy_context"] for _ in range(data['query'].shape[0])] + "input": data["query"].tolist(), + "actual_output": data["answer"].tolist(), + "expected_output": data["ref_answer"].tolist(), + "retrieval_context": [["dummy_context"] for _ in range(data["query"].shape[0])], } return output @@ -25,44 +31,48 @@ def make_list_of_test_cases(data): for _, row in data.iterrows(): output.append( { - 'input': 
[row['query']], - 'actual_output': [row['answer']], - 'expected_output': [row['ref_answer']], - 'retrieval_context': [["dummy_context"]] + "input": [row["query"]], + "actual_output": [row["answer"]], + "expected_output": [row["ref_answer"]], + "retrieval_context": [["dummy_context"]], } ) return output + def grade_answers(args, test_case): from langchain_community.embeddings import HuggingFaceBgeEmbeddings - print('==============getting embeddings==============') + + print("==============getting embeddings==============") embeddings = HuggingFaceBgeEmbeddings(model_name=args.embed_model) - print('==============initiating metric==============') - metric = RagasMetric(threshold=0.5, - metrics=["answer_correctness"], - model= args.llm_endpoint, - embeddings=embeddings) - print('==============start grading==============') + print("==============initiating metric==============") + metric = RagasMetric(threshold=0.5, metrics=["answer_correctness"], model=args.llm_endpoint, embeddings=embeddings) + print("==============start grading==============") if args.batch_grade: metric.measure(test_case) - return metric.score['answer_correctness'] + return metric.score["answer_correctness"] else: scores = [] for case in test_case: metric.measure(case) - scores.append(metric.score['answer_correctness']) + scores.append(metric.score["answer_correctness"]) print(metric.score) - print('-'*50) + print("-" * 50) return scores -if __name__ == '__main__': + +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--embed_model", type=str, default="BAAI/bge-base-en-v1.5") parser.add_argument("--llm_endpoint", type=str, default="http://localhost:8008") parser.add_argument("--filedir", type=str, help="Path to the file containing the data") parser.add_argument("--filename", type=str, help="Name of the file containing the data") - parser.add_argument("--batch_grade", action="store_true", help="Grade the answers in batch and get an aggregated score for the entire dataset") + parser.add_argument( + "--batch_grade", + action="store_true", + help="Grade the answers in batch and get an aggregated score for the entire dataset", + ) args = parser.parse_args() data = pd.read_csv(os.path.join(args.filedir, args.filename)) @@ -71,7 +81,7 @@ def grade_answers(args, test_case): test_case = convert_data_format_for_ragas(data) else: test_case = make_list_of_test_cases(data) - + # print(test_case) scores = grade_answers(args, test_case) @@ -80,11 +90,8 @@ def grade_answers(args, test_case): if args.batch_grade: print("Aggregated answer correctness score: ", scores) else: - data['answer_correctness'] = scores - print("Average answer correctness score: ", data['answer_correctness'].mean()) - output_file = args.filename.split('.')[0] + '_graded.csv' + data["answer_correctness"] = scores + print("Average answer correctness score: ", data["answer_correctness"].mean()) + output_file = args.filename.split(".")[0] + "_graded.csv" data.to_csv(os.path.join(args.filedir, output_file), index=False) print("Scores saved to ", os.path.join(args.filedir, output_file)) - - - diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml index 56b1e6dd..572011ef 100644 --- a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel 
Corporation +# SPDX-License-Identifier: Apache-2.0 + services: tgi-service: image: ghcr.io/huggingface/tgi-gaudi:latest @@ -20,4 +23,4 @@ services: cap_add: - SYS_NICE ipc: host - command: --model-id ${LLM_MODEL_ID} --max-input-length 4096 --max-total-tokens 8192 --sharded true --num-shard 4 \ No newline at end of file + command: --model-id ${LLM_MODEL_ID} --max-input-length 4096 --max-total-tokens 8192 --sharded true --num-shard 4 diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml index 4f6f7f23..a954098e 100644 --- a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + version: "3.8" services: @@ -16,4 +19,4 @@ services: HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 - command: --model-id ${LLM_MODEL_ID} \ No newline at end of file + command: --model-id ${LLM_MODEL_ID} diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh b/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh index 5b13d01d..0cb08d8f 100644 --- a/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh @@ -1,4 +1,7 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + export LLM_MODEL_ID="meta-llama/Meta-Llama-3-70B-Instruct" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export HF_CACHE_DIR=${HF_CACHE_DIR} -docker compose -f docker-compose-llm-judge-gaudi.yaml up -d \ No newline at end of file +docker compose -f docker-compose-llm-judge-gaudi.yaml up -d diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py b/evals/evaluation/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py index ce961173..c23f6af9 100644 --- a/evals/evaluation/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py @@ -1,7 +1,11 @@ -from langchain_huggingface import HuggingFaceEndpoint +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import os -host_ip=os.environ.get("host_ip", "localhost") +from langchain_huggingface import HuggingFaceEndpoint + +host_ip = os.environ.get("host_ip", "localhost") url = "http://{host_ip}:8085".format(host_ip=host_ip) print(url) diff --git a/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh b/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh index d4529fdb..b8e594e4 100644 --- a/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh +++ b/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + host_ip=$(hostname -I | awk '{print $1}') # change this to the IP of the agent port=9095 # change this to the port of the agent endpoint=${port}/v1/chat/completions # change this to the endpoint of the agent diff --git a/evals/evaluation/crag_eval/run_benchmark/run_grading.sh b/evals/evaluation/crag_eval/run_benchmark/run_grading.sh index 82cae435..ac432787 100644 --- a/evals/evaluation/crag_eval/run_benchmark/run_grading.sh +++ 
b/evals/evaluation/crag_eval/run_benchmark/run_grading.sh @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + FILEDIR=$WORKDIR/datasets/crag_results/ FILENAME=results.csv LLM_ENDPOINT=http://${host_ip}:8085 diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index d80a3745..06bf96da 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -7,9 +7,9 @@ import os from typing import Dict, Optional, Union -from langchain_huggingface import HuggingFaceEndpoint from langchain_core.embeddings import Embeddings from langchain_core.language_models import BaseLanguageModel +from langchain_huggingface import HuggingFaceEndpoint def format_ragas_metric_name(name: str): @@ -20,22 +20,26 @@ def get_metric(name: str): validated_list = ["answer_relevancy", "faithfulness", "answer_correctness"] if name == "answer_relevancy": from ragas.metrics import answer_relevancy + return answer_relevancy elif name == "faithfulness": from ragas.metrics import faithfulness + return faithfulness elif name == "answer_correctness": from ragas.metrics import answer_correctness + return answer_correctness else: raise ValueError( - "metric should be in supported list {}. ".format(validated_list) - + "ClientResponseError raised with LangchainLLM " - + "when context_precision, context_recall ran. " - + "Here are the related issues described in ragas " - "https://github.com/explodinggradients/ragas/issues/934, " - + "https://github.com/explodinggradients/ragas/issues/664." - ) + "metric should be in supported list {}. ".format(validated_list) + + "ClientResponseError raised with LangchainLLM " + + "when context_precision, context_recall ran. " + + "Here are the related issues described in ragas " + "https://github.com/explodinggradients/ragas/issues/934, " + + "https://github.com/explodinggradients/ragas/issues/664." 
+ ) + class RagasMetric: """This metric checks if the output is more than 3 letters.""" @@ -81,7 +85,7 @@ def __init__( print("OPENAI_API_KEY is provided, ragas initializes the model by OpenAI.") self.model = None if isinstance(self.model, str): - print('LLM endpoint: ', self.model) + print("LLM endpoint: ", self.model) self.chat_model = HuggingFaceEndpoint( endpoint_url=self.model, task="text-generation", @@ -90,7 +94,7 @@ def __init__( ) else: self.chat_model = self.model - + # initialize metrics if self.metrics is not None: tmp_metrics = [] @@ -110,7 +114,7 @@ def __init__( raise ValueError("answer_relevancy metric need provide embeddings model.") tmp_metrics.append(get_metric(metric)) self.metrics = tmp_metrics - else: # default metrics + else: # default metrics self.metrics = [ answer_relevancy, faithfulness, @@ -121,17 +125,17 @@ def __init__( context_recall, ] - async def a_measure(self, test_case: Dict): return self.measure(test_case) def measure(self, test_case: Dict): from ragas import evaluate + try: from datasets import Dataset except ModuleNotFoundError: raise ModuleNotFoundError("Please install dataset") - + # Create a dataset from the test case # Convert the Dict to a format compatible with Dataset data = { From 90e855bd43071b3ea04ad7dc4a831768817a96d6 Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Tue, 27 Aug 2024 15:39:56 -0700 Subject: [PATCH 13/18] update readme and test all commands --- evals/evaluation/crag_eval/README.md | 29 +++++++++++++++---- .../crag_eval/preprocess_data/process_data.py | 5 +++- .../preprocess_data/run_data_preprocess.sh | 6 ++-- .../preprocess_data/run_sample_data.sh | 2 +- .../crag_eval/preprocess_data/sample_data.py | 1 + .../run_benchmark/generate_answers.py | 3 ++ .../run_benchmark/run_generate_answer.sh | 4 +-- .../crag_eval/run_benchmark/run_grading.sh | 4 +-- 8 files changed, 39 insertions(+), 15 deletions(-) diff --git a/evals/evaluation/crag_eval/README.md b/evals/evaluation/crag_eval/README.md index 1e5ce1c2..df7fd772 100644 --- a/evals/evaluation/crag_eval/README.md +++ b/evals/evaluation/crag_eval/README.md @@ -45,17 +45,22 @@ Data preprocessing directly relates to the quality of retrieval corpus and thus cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/preprocess_data bash run_data_preprocess.sh ``` -3. Optional - Sample queries for benchmark +**Note**: This is an example of data processing. You can develop and optimize your own data processing for this benchmark. +3. Sample queries for benchmark The CRAG dataset has more than 4000 queries, and running all of them can be very expensive and time-consuming. You can sample a subset for benchmark. Here we provide a script to sample up to 5 queries per question_type per dynamism in each domain. For example, we were able to get 92 queries from the music domain using the script. ``` bash run_sample_data.sh ``` ## Launch agent QnA system -Here we showcase a RAG agent in GenAIExample repo. Please refer to the README in the AgentQnA example for more details. You can build your own agent systems using OPEA components, then expose your own systems as an endpoint for this benchmark. +Here we showcase a RAG agent in GenAIExample repo. Please refer to the README in the [AgentQnA example](https://github.com/opea-project/GenAIExamples/tree/main/AgentQnA) for more details.
+**Please note**: This AgentQnA example is only one option. You can build your own agent system with OPEA components and expose it as an endpoint for this benchmark; it only needs to honor the request/response contract sketched below.
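+Whichever system you expose, the contract is the one used by `generate_answers.py`: the benchmark POSTs a JSON body with a single `query` field (the question plus its timestamp) and reads the agent's answer from the `text` field of the JSON response. A minimal sketch of a compatible call, assuming the default port and route from `run_generate_answer.sh`; the host, question and timestamp below are placeholders:
+```python
+import requests
+
+# Placeholder URL: default port 9095 and /v1/chat/completions route from run_generate_answer.sh;
+# substitute the host/port where your own agent endpoint is served.
+url = "http://localhost:9095/v1/chat/completions"
+
+# The benchmark sends the query and its timestamp in a single prompt string (illustrative values).
+prompt = "Question: {}\nThe question was asked at: {}".format(
+    "example question about the music domain", "03/05/2024, 23:19:44 PT"
+)
+
+response = requests.post(url, json={"query": prompt}, proxies={"http": ""})
+print(response.json()["text"])  # the agent's answer, as read by generate_answers.py
+```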
+To launch the agent in our AgentQnA example, open another terminal and build images and launch agent system there. 1. Build images ``` -git clone +export $WORKDIR= +cd $WORKDIR +git clone https://github.com/opea-project/GenAIExamples.git cd GenAIExamples/AgentQnA/tests/ bash 1_build_images.sh ``` @@ -65,17 +70,27 @@ bash 2_start_retrieval_tool.sh ``` 3. Ingest data into vector database and validate retrieval tool ``` -# Placeholder - may change depending on data -bash 3_ingest_data_and_validate_retrieval.sh +# As an example, we will use the index_data.py script in AgentQnA example. +# You can write your own script to ingest data. +# As an example, We will ingest the docs of the music domain. +# We will use the crag-eval docker container to run the index_data.py script. +# The index_data.py is a client script. +# it will send data-indexing requests to the dataprep server that is part of the retrieval tool. +# So you need to switch back to the terminal where the crag-eval container is running. +cd $WORKDIR/GenAIExamples/AgentQnA/retrieval_tool/ +python3 index_data.py --host_ip $host_ip --filedir ${WORKDIR}/datasets/crag_docs/ --filename crag_docs_music.jsonl ``` -3. Launch and validate agent endpoint +4. Launch and validate agent endpoint ``` +# Go to the terminal where you launched the AgentQnA example +cd $WORKDIR/GenAIExamples/AgentQnA/tests/ bash 4_launch_and_validate_agent.sh ``` ## Run CRAG benchmark Once you have your agent system up and running, the next step is to generate answers with agent. Change the variables in the script below and run the script. By default, it will run a sampled set of queries in music domain. ``` +# Come back to the interactive crag-eval docker container cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark bash run_generate_answer.sh ``` @@ -96,11 +111,13 @@ curl ${host_ip}:8085/generate_stream \ ``` And then go back to the interactive crag-eval docker, run command below. ``` +# Inside the crag-eval container cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark/llm_judge/ python3 test_llm_endpoint.py ``` 3. Grade the answer correctness using LLM judge. We use `answer_correctness` metrics from [ragas](https://github.com/explodinggradients/ragas/blob/main/src/ragas/metrics/_answer_correctness.py). 
``` +# Inside the crag-eval container cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark/ bash run_grading.sh ``` diff --git a/evals/evaluation/crag_eval/preprocess_data/process_data.py b/evals/evaluation/crag_eval/preprocess_data/process_data.py index 2f5dea00..e56df9e1 100644 --- a/evals/evaluation/crag_eval/preprocess_data/process_data.py +++ b/evals/evaluation/crag_eval/preprocess_data/process_data.py @@ -64,7 +64,6 @@ def preprocess_data(input_file): # "domain": data['domain'], # "doc":chunk}) snippet.append({"query": data["query"], "domain": data["domain"], "doc": doc["page_snippet"]}) - print("-----------------------------------") # qa pairs without search results output = {} @@ -73,6 +72,10 @@ def preprocess_data(input_file): output[k] = v return_data.append(output) + n+=1 + if n == 10: + break + return snippet, return_data diff --git a/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh b/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh index d93a5fa1..780f5f29 100644 --- a/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh +++ b/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 FILEDIR=$WORKDIR/datasets/crag_task_3_dev_v4 -DOCOUT=$WORKDIR/datasets/crag_docs -QAOUT=$WORKDIR/datasets/crag_qas +DOCOUT=$WORKDIR/datasets/crag_docs/ +QAOUT=$WORKDIR/datasets/crag_qas/ -python3 process_data.py --data_dir $FILEDIR --docout $DOCOUT --qaout $QAOUT +python3 process_data.py --filedir $FILEDIR --docout $DOCOUT --qaout $QAOUT diff --git a/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh b/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh index e0e3b0c8..dd104326 100644 --- a/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh +++ b/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh @@ -1,6 +1,6 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -FILEDIR=$WORKDIR/datasets/crag_docs +FILEDIR=$WORKDIR/datasets/crag_qas python3 sample_data.py --filedir $FILEDIR diff --git a/evals/evaluation/crag_eval/preprocess_data/sample_data.py b/evals/evaluation/crag_eval/preprocess_data/sample_data.py index f4aa8209..51621194 100644 --- a/evals/evaluation/crag_eval/preprocess_data/sample_data.py +++ b/evals/evaluation/crag_eval/preprocess_data/sample_data.py @@ -27,6 +27,7 @@ def sample_data(input_file, output_file): data_files = os.listdir(args.filedir) for file in tqdm.tqdm(data_files): + print(file) file = os.path.join(args.filedir, file) output_file = file.replace(".jsonl", "_sampled.jsonl") sample_data(file, output_file) diff --git a/evals/evaluation/crag_eval/run_benchmark/generate_answers.py b/evals/evaluation/crag_eval/run_benchmark/generate_answers.py index bad9b768..06eed7e7 100644 --- a/evals/evaluation/crag_eval/run_benchmark/generate_answers.py +++ b/evals/evaluation/crag_eval/run_benchmark/generate_answers.py @@ -51,6 +51,9 @@ def save_as_csv(output): df = get_test_data(args) # df = df.head() # for validation purpose + if not os.path.exists(os.path.dirname(args.output_file)): + os.makedirs(os.path.dirname(args.output_file)) + output_list = [] n = 0 for _, row in df.iterrows(): diff --git a/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh b/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh index b8e594e4..ee863bba 100644 --- a/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh +++ b/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh @@ -1,14 
+1,14 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -host_ip=$(hostname -I | awk '{print $1}') # change this to the IP of the agent +host_ip=$host_ip # change this to the host IP of the agent port=9095 # change this to the port of the agent endpoint=${port}/v1/chat/completions # change this to the endpoint of the agent URL="http://${host_ip}:${endpoint}" echo "AGENT ENDPOINT URL: ${URL}" QUERYFILE=$WORKDIR/datasets/crag_qas/crag_qa_music_sampled.jsonl -OUTPUTFILE=$WORKDIR/datasets/crag_results/results.jsonl +OUTPUTFILE=$WORKDIR/datasets/crag_results/crag_music_sampled_results.jsonl python3 generate_answers.py \ --endpoint_url ${URL} \ diff --git a/evals/evaluation/crag_eval/run_benchmark/run_grading.sh b/evals/evaluation/crag_eval/run_benchmark/run_grading.sh index ac432787..5431d39b 100644 --- a/evals/evaluation/crag_eval/run_benchmark/run_grading.sh +++ b/evals/evaluation/crag_eval/run_benchmark/run_grading.sh @@ -2,8 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 FILEDIR=$WORKDIR/datasets/crag_results/ -FILENAME=results.csv -LLM_ENDPOINT=http://${host_ip}:8085 +FILENAME=crag_music_sampled_results.csv +LLM_ENDPOINT=http://${host_ip}:8085 # change host_ip to the IP of LLM endpoint python3 grade_answers.py \ --filedir $FILEDIR \ From 153e30fd3370eced47ee50dfdbe490dc0c1a0e6a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 27 Aug 2024 22:40:15 +0000 Subject: [PATCH 14/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- evals/evaluation/crag_eval/preprocess_data/process_data.py | 2 +- evals/evaluation/crag_eval/run_benchmark/generate_answers.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/evals/evaluation/crag_eval/preprocess_data/process_data.py b/evals/evaluation/crag_eval/preprocess_data/process_data.py index e56df9e1..f8f4bb39 100644 --- a/evals/evaluation/crag_eval/preprocess_data/process_data.py +++ b/evals/evaluation/crag_eval/preprocess_data/process_data.py @@ -72,7 +72,7 @@ def preprocess_data(input_file): output[k] = v return_data.append(output) - n+=1 + n += 1 if n == 10: break diff --git a/evals/evaluation/crag_eval/run_benchmark/generate_answers.py b/evals/evaluation/crag_eval/run_benchmark/generate_answers.py index 06eed7e7..19f7f747 100644 --- a/evals/evaluation/crag_eval/run_benchmark/generate_answers.py +++ b/evals/evaluation/crag_eval/run_benchmark/generate_answers.py @@ -53,7 +53,7 @@ def save_as_csv(output): if not os.path.exists(os.path.dirname(args.output_file)): os.makedirs(os.path.dirname(args.output_file)) - + output_list = [] n = 0 for _, row in df.iterrows(): From d51c0832105570888bee4d269c526773c1e81ccc Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Mon, 9 Sep 2024 09:47:43 -0700 Subject: [PATCH 15/18] mv crag_eval to agent_eval Signed-off-by: minmin-intel --- .../{ => agent_eval}/crag_eval/README.md | 10 ++-- .../crag_eval/docker/Dockerfile | 0 .../crag_eval/docker/build_image.sh | 0 .../crag_eval/docker/launch_eval_container.sh | 0 .../crag_eval/docker/requirements.txt | 0 .../crag_eval/preprocess_data/process_data.py | 0 .../preprocess_data/run_data_preprocess.sh | 0 .../preprocess_data/run_sample_data.sh | 0 .../crag_eval/preprocess_data/sample_data.py | 0 .../run_benchmark/generate_answers.py | 0 .../crag_eval/run_benchmark/grade_answers.py | 0 .../docker-compose-llm-judge-gaudi.yaml | 0 .../llm_judge/docker-compose-llm-judge.yaml | 0 .../llm_judge/launch_llm_judge_endpoint.sh 
| 0 .../llm_judge/test_llm_endpoint.py | 0 .../run_benchmark/run_generate_answer.sh | 0 .../crag_eval/run_benchmark/run_grading.sh | 0 evals/metrics/ragas/ragas.py | 53 ++++++++++--------- 18 files changed, 34 insertions(+), 29 deletions(-) rename evals/evaluation/{ => agent_eval}/crag_eval/README.md (94%) rename evals/evaluation/{ => agent_eval}/crag_eval/docker/Dockerfile (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/docker/build_image.sh (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/docker/launch_eval_container.sh (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/docker/requirements.txt (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/preprocess_data/process_data.py (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/preprocess_data/run_data_preprocess.sh (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/preprocess_data/run_sample_data.sh (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/preprocess_data/sample_data.py (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/run_benchmark/generate_answers.py (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/run_benchmark/grade_answers.py (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/run_benchmark/run_generate_answer.sh (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/run_benchmark/run_grading.sh (100%) diff --git a/evals/evaluation/crag_eval/README.md b/evals/evaluation/agent_eval/crag_eval/README.md similarity index 94% rename from evals/evaluation/crag_eval/README.md rename to evals/evaluation/agent_eval/crag_eval/README.md index df7fd772..7b66f8a0 100644 --- a/evals/evaluation/crag_eval/README.md +++ b/evals/evaluation/agent_eval/crag_eval/README.md @@ -11,7 +11,7 @@ git clone https://github.com/opea-project/GenAIEval.git ``` 2. Build docker image ``` -cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/docker/ +cd $WORKDIR/GenAIEval/evals/evaluation/agent_eval/crag_eval/docker/ bash build_image.sh ``` 3. Set environment vars for downloading models from Huggingface @@ -42,7 +42,7 @@ tar -xf crag_task_3_dev_v4.tar.bz2 2. Preprocess the CRAG data Data preprocessing directly relates to the quality of retrieval corpus and thus can have significant impact on the agent QnA system. Here, we provide one way of preprocessing the data where we simply extracts all the web search snippets as-is from the dataset per domain. We also extract all the query-answer pairs along with other meta data per domain. You can run the command below to use our method. The data processing will take some time to finish. ``` -cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/preprocess_data +cd $WORKDIR/GenAIEval/evals/evaluation/agent_eval/crag_eval/preprocess_data bash run_data_preprocess.sh ``` **Note**: This is an example of data processing. You can develop and optimize your own data processing for this benchmark. @@ -91,7 +91,7 @@ bash 4_launch_and_validate_agent.sh Once you have your agent system up and running, the next step is to generate answers with agent. Change the variables in the script below and run the script. 
By default, it will run a sampled set of queries in music domain. ``` # Come back to the interactive crag-eval docker container -cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark +cd $WORKDIR/GenAIEval/evals/evaluation/agent_eval/crag_eval/run_benchmark bash run_generate_answer.sh ``` @@ -112,12 +112,12 @@ curl ${host_ip}:8085/generate_stream \ And then go back to the interactive crag-eval docker, run command below. ``` # Inside the crag-eval container -cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark/llm_judge/ +cd $WORKDIR/GenAIEval/evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/ python3 test_llm_endpoint.py ``` 3. Grade the answer correctness using LLM judge. We use `answer_correctness` metrics from [ragas](https://github.com/explodinggradients/ragas/blob/main/src/ragas/metrics/_answer_correctness.py). ``` # Inside the crag-eval container -cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark/ +cd $WORKDIR/GenAIEval/evals/evaluation/agent_eval/crag_eval/run_benchmark/ bash run_grading.sh ``` diff --git a/evals/evaluation/crag_eval/docker/Dockerfile b/evals/evaluation/agent_eval/crag_eval/docker/Dockerfile similarity index 100% rename from evals/evaluation/crag_eval/docker/Dockerfile rename to evals/evaluation/agent_eval/crag_eval/docker/Dockerfile diff --git a/evals/evaluation/crag_eval/docker/build_image.sh b/evals/evaluation/agent_eval/crag_eval/docker/build_image.sh similarity index 100% rename from evals/evaluation/crag_eval/docker/build_image.sh rename to evals/evaluation/agent_eval/crag_eval/docker/build_image.sh diff --git a/evals/evaluation/crag_eval/docker/launch_eval_container.sh b/evals/evaluation/agent_eval/crag_eval/docker/launch_eval_container.sh similarity index 100% rename from evals/evaluation/crag_eval/docker/launch_eval_container.sh rename to evals/evaluation/agent_eval/crag_eval/docker/launch_eval_container.sh diff --git a/evals/evaluation/crag_eval/docker/requirements.txt b/evals/evaluation/agent_eval/crag_eval/docker/requirements.txt similarity index 100% rename from evals/evaluation/crag_eval/docker/requirements.txt rename to evals/evaluation/agent_eval/crag_eval/docker/requirements.txt diff --git a/evals/evaluation/crag_eval/preprocess_data/process_data.py b/evals/evaluation/agent_eval/crag_eval/preprocess_data/process_data.py similarity index 100% rename from evals/evaluation/crag_eval/preprocess_data/process_data.py rename to evals/evaluation/agent_eval/crag_eval/preprocess_data/process_data.py diff --git a/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh b/evals/evaluation/agent_eval/crag_eval/preprocess_data/run_data_preprocess.sh similarity index 100% rename from evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh rename to evals/evaluation/agent_eval/crag_eval/preprocess_data/run_data_preprocess.sh diff --git a/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh b/evals/evaluation/agent_eval/crag_eval/preprocess_data/run_sample_data.sh similarity index 100% rename from evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh rename to evals/evaluation/agent_eval/crag_eval/preprocess_data/run_sample_data.sh diff --git a/evals/evaluation/crag_eval/preprocess_data/sample_data.py b/evals/evaluation/agent_eval/crag_eval/preprocess_data/sample_data.py similarity index 100% rename from evals/evaluation/crag_eval/preprocess_data/sample_data.py rename to evals/evaluation/agent_eval/crag_eval/preprocess_data/sample_data.py diff --git 
a/evals/evaluation/crag_eval/run_benchmark/generate_answers.py b/evals/evaluation/agent_eval/crag_eval/run_benchmark/generate_answers.py similarity index 100% rename from evals/evaluation/crag_eval/run_benchmark/generate_answers.py rename to evals/evaluation/agent_eval/crag_eval/run_benchmark/generate_answers.py diff --git a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py similarity index 100% rename from evals/evaluation/crag_eval/run_benchmark/grade_answers.py rename to evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml b/evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml similarity index 100% rename from evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml rename to evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml b/evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml similarity index 100% rename from evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml rename to evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh b/evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh similarity index 100% rename from evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh rename to evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py b/evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py similarity index 100% rename from evals/evaluation/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py rename to evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py diff --git a/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_generate_answer.sh similarity index 100% rename from evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh rename to evals/evaluation/agent_eval/crag_eval/run_benchmark/run_generate_answer.sh diff --git a/evals/evaluation/crag_eval/run_benchmark/run_grading.sh b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_grading.sh similarity index 100% rename from evals/evaluation/crag_eval/run_benchmark/run_grading.sh rename to evals/evaluation/agent_eval/crag_eval/run_benchmark/run_grading.sh diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index 06bf96da..ac971364 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -16,29 +16,8 @@ def format_ragas_metric_name(name: str): return f"{name} (ragas)" -def get_metric(name: str): - validated_list = ["answer_relevancy", "faithfulness", "answer_correctness"] - if name == "answer_relevancy": - from ragas.metrics import answer_relevancy - - return answer_relevancy - elif name == "faithfulness": - from ragas.metrics import faithfulness - - return faithfulness - elif name == "answer_correctness": - from ragas.metrics import answer_correctness - - return answer_correctness 
- else: - raise ValueError( - "metric should be in supported list {}. ".format(validated_list) - + "ClientResponseError raised with LangchainLLM " - + "when context_precision, context_recall ran. " - + "Here are the related issues described in ragas " - "https://github.com/explodinggradients/ragas/issues/934, " - + "https://github.com/explodinggradients/ragas/issues/664." - ) + + class RagasMetric: @@ -112,7 +91,7 @@ def __init__( else: if metric == "answer_relevancy" and self.embeddings is None: raise ValueError("answer_relevancy metric need provide embeddings model.") - tmp_metrics.append(get_metric(metric)) + tmp_metrics.append(self.get_metric(metric)) self.metrics = tmp_metrics else: # default metrics self.metrics = [ @@ -125,6 +104,32 @@ def __init__( context_recall, ] + def get_metric(name: str): + if name == "answer_relevancy": + from ragas.metrics import answer_relevancy + return answer_relevancy + elif name == "faithfulness": + from ragas.metrics import faithfulness + return faithfulness + elif name == "answer_correctness": + from ragas.metrics import answer_correctness + return answer_correctness + elif name == "answer_similarity": + from ragas.metrics import answer_similarity + return answer_similarity + elif name == "context_precision": + from ragas.metrics import context_precision + return context_precision + elif name == "context_relevancy": + from ragas.metrics import context_relevancy + return context_relevancy + elif name == "context_recall": + from ragas.metrics import context_recall + return context_recall + else: + raise ValueError(f"The {name} metric has not been validated.") + + async def a_measure(self, test_case: Dict): return self.measure(test_case) From e4cd9c6977e2a679b86a44e6b32086ae2b155e98 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Sep 2024 17:22:37 +0000 Subject: [PATCH 16/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- evals/metrics/ragas/ragas.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index 93ee0bde..35449c08 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -16,10 +16,6 @@ def format_ragas_metric_name(name: str): return f"{name} (ragas)" - - - - class RagasMetric: """This metric checks if the output is more than 3 letters.""" @@ -64,7 +60,7 @@ def __init__( from datasets import Dataset except ModuleNotFoundError: raise ModuleNotFoundError("Please install dataset") - + self.metrics_instance = { "answer_correctness": answer_correctness, "answer_relevancy": answer_relevancy, @@ -110,9 +106,9 @@ def __init__( if metric == "answer_relevancy" and self.embeddings is None: raise ValueError("answer_relevancy metric need provide embeddings model.") tmp_metrics.append(self.metrics_instance[metric]) - + self.metrics = tmp_metrics - + else: # default metrics self.metrics = [ answer_relevancy, @@ -122,7 +118,6 @@ def __init__( context_precision, context_recall, ] - async def a_measure(self, test_case: Dict): return self.measure(test_case) From 69f87c220bc50041c75cd3967c9d7b17a6b1a4d7 Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Mon, 9 Sep 2024 17:44:03 +0000 Subject: [PATCH 17/18] update test case col names in grade_answer.py Signed-off-by: minmin-intel --- .../crag_eval/run_benchmark/grade_answers.py | 22 +++++++------------ tests/test_ragas.py | 9 ++++---- 2 files changed, 13 
insertions(+), 18 deletions(-) diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py index 094a87aa..8f95d497 100644 --- a/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py @@ -11,30 +11,24 @@ def convert_data_format_for_ragas(data): - # data: pandas dataframe - # columns: ['query', 'answer', 'ref_answer'] - # return: a dict with keys: 'input', 'actual_output', 'expected_output' output = { - "input": data["query"].tolist(), - "actual_output": data["answer"].tolist(), - "expected_output": data["ref_answer"].tolist(), - "retrieval_context": [["dummy_context"] for _ in range(data["query"].shape[0])], + "question": data["query"].tolist(), + "answer": data["answer"].tolist(), + "ground_truth": data["ref_answer"].tolist(), + "contexts": [["dummy_context"] for _ in range(data["query"].shape[0])], } return output def make_list_of_test_cases(data): - # data: pandas dataframe - # columns: ['query', 'answer', 'ref_answer'] - # return: a list of dicts with keys: 'input', 'actual_output', 'expected_output' output = [] for _, row in data.iterrows(): output.append( { - "input": [row["query"]], - "actual_output": [row["answer"]], - "expected_output": [row["ref_answer"]], - "retrieval_context": [["dummy_context"]], + "question": [row["query"]], + "answer": [row["answer"]], + "ground_truth": [row["ref_answer"]], + "contexts": [["dummy_context"]], } ) return output diff --git a/tests/test_ragas.py b/tests/test_ragas.py index e11835ad..eab71800 100644 --- a/tests/test_ragas.py +++ b/tests/test_ragas.py @@ -5,13 +5,14 @@ import unittest - +import os from evals.metrics.ragas import RagasMetric - +host_ip = os.getenv("host_ip", "localhost") +port = os.getenv("port", "8008") class TestRagasMetric(unittest.TestCase): - @unittest.skip("need pass localhost id") + # @unittest.skip("need pass localhost id") def test_ragas(self): # Replace this with the actual output from your LLM application actual_output = "We offer a 30-day full refund at no extra cost." @@ -24,7 +25,7 @@ def test_ragas(self): from langchain_community.embeddings import HuggingFaceBgeEmbeddings embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5") - metric = RagasMetric(threshold=0.5, model="http://localhost:8008", embeddings=embeddings) + metric = RagasMetric(threshold=0.5, model=f"http://{host_ip}:{port}", embeddings=embeddings) test_case = { "question": ["What if these shoes don't fit?"], "answer": [actual_output], From 82894aa48ca6d97a0e61db5a97765e08ff2504a7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Sep 2024 17:39:12 +0000 Subject: [PATCH 18/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_ragas.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_ragas.py b/tests/test_ragas.py index eab71800..3376b0b5 100644 --- a/tests/test_ragas.py +++ b/tests/test_ragas.py @@ -4,12 +4,15 @@ # SPDX-License-Identifier: Apache-2.0 -import unittest import os +import unittest + from evals.metrics.ragas import RagasMetric host_ip = os.getenv("host_ip", "localhost") port = os.getenv("port", "8008") + + class TestRagasMetric(unittest.TestCase): # @unittest.skip("need pass localhost id")
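With the column renaming in place, both the CRAG grading script and the unit test feed ragas-native keys (`question`, `answer`, `ground_truth`, `contexts`). A minimal sketch of how the updated `RagasMetric` is exercised against a TGI judge endpoint, mirroring the test above; the endpoint address comes from the `host_ip`/`port` environment variables and the test-case strings are illustrative only:

```python
import os

from langchain_community.embeddings import HuggingFaceBgeEmbeddings

from evals.metrics.ragas import RagasMetric

host_ip = os.getenv("host_ip", "localhost")
port = os.getenv("port", "8008")

# Embedding model passed to the ragas metrics (same default as the grading script).
embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5")
metric = RagasMetric(threshold=0.5, model=f"http://{host_ip}:{port}", embeddings=embeddings)

# ragas-native column names introduced by this patch series; values are illustrative.
test_case = {
    "question": ["What if these shoes don't fit?"],
    "answer": ["We offer a 30-day full refund at no extra cost."],
    "ground_truth": ["You are eligible for a 30 day full refund at no extra cost."],
    "contexts": [["All customers are eligible for a 30 day full refund at no extra cost."]],
}

metric.measure(test_case)
print(metric.score)  # dict of per-metric scores, e.g. {"answer_correctness": ...}
```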