From be548039cb0d54903aca5477d3124c02ceca3d5d Mon Sep 17 00:00:00 2001
From: minmin-intel
Date: Thu, 22 Aug 2024 16:29:01 -0700
Subject: [PATCH 01/18] add crag eval first pass code

---
 evals/evaluation/crag_eval/README.md | 79 +++++++++++++
 .../crag_eval/preprocess_data/process_data.py | 111 ++++++++++++++++++
 .../preprocess_data/run_data_preprocess.sh | 5 +
 .../preprocess_data/run_sample_data.sh | 3 +
 .../crag_eval/preprocess_data/sample_data.py | 26 ++++
 evals/evaluation/crag_eval/requirements.txt | 1 +
 .../run_benchmark/generate_answers.py | 71 +++++++++++
 .../crag_eval/run_benchmark/grade_answers.py | 8 ++
 .../run_benchmark/run_generate_answer.sh | 13 ++
 9 files changed, 317 insertions(+)
 create mode 100644 evals/evaluation/crag_eval/README.md
 create mode 100644 evals/evaluation/crag_eval/preprocess_data/process_data.py
 create mode 100644 evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh
 create mode 100644 evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh
 create mode 100644 evals/evaluation/crag_eval/preprocess_data/sample_data.py
 create mode 100644 evals/evaluation/crag_eval/requirements.txt
 create mode 100644 evals/evaluation/crag_eval/run_benchmark/generate_answers.py
 create mode 100644 evals/evaluation/crag_eval/run_benchmark/grade_answers.py
 create mode 100644 evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh

diff --git a/evals/evaluation/crag_eval/README.md b/evals/evaluation/crag_eval/README.md
new file mode 100644
index 00000000..5f68b24c
--- /dev/null
+++ b/evals/evaluation/crag_eval/README.md
@@ -0,0 +1,79 @@
+# CRAG Benchmark for Agent QnA systems
+## Overview
+The [Comprehensive RAG (CRAG) benchmark](https://www.aicrowd.com/challenges/meta-comprehensive-rag-benchmark-kdd-cup-2024) was introduced by Meta in 2024 as a KDD Cup challenge. The CRAG benchmark has questions across five domains and eight question types, and provides a practical setup for evaluating RAG systems. In particular, CRAG includes questions whose answers change over time spans ranging from seconds to years; it accounts for entity popularity, covering not only head facts but also torso and tail facts; and it contains simple-fact questions as well as seven types of complex questions, such as comparison, aggregation, and set questions, to test the reasoning and synthesis capabilities of RAG solutions. Additionally, CRAG provides mock APIs for querying mock knowledge graphs, so developers can also benchmark the API-calling capabilities of agents. Moreover, golden answers are provided in the dataset, which makes auto-evaluation with LLMs more robust. Therefore, the CRAG benchmark is a realistic and comprehensive benchmark for agents.
+
+## Getting started
+1. Set up a work directory and download this repo into your work directory.
+```
+export WORKDIR=
+cd $WORKDIR
+git clone https://github.com/opea-project/GenAIEval.git
+```
+2. Create conda environment and install packages
+```
+conda create -n crag-benchmark-env python=3.11
+conda activate crag-benchmark-env
+cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/
+pip install -r requirements.txt
+```
+## CRAG dataset
+1. Download original data and process it with commands below.
+You need to create an account on the Meta CRAG challenge [website](https://www.aicrowd.com/challenges/meta-comprehensive-rag-benchmark-kdd-cup-2024).
+After login, go to this [link](https://www.aicrowd.com/challenges/meta-comprehensive-rag-benchmark-kdd-cup-2024/problems/meta-kdd-cup-24-crag-end-to-end-retrieval-augmented-generation/dataset_files) and download the `crag_task_3_dev_v4.tar.bz2` file. Then make a `datasets` directory in your work directory using the commands below.
+```
+cd $WORKDIR
+mkdir datasets
+```
+Then put the `crag_task_3_dev_v4.tar.bz2` file in the `datasets` directory, and decompress it by running the command below.
+```
+cd $WORKDIR/datasets
+tar -xf crag_task_3_dev_v4.tar.bz2
+```
+2. Preprocess the CRAG data
+Data preprocessing directly affects the quality of the retrieval corpus and thus can have a significant impact on the agent QnA system. Here, we provide one way of preprocessing the data, where we simply extract all the web search snippets as-is from the dataset, per domain. We also extract all the query-answer pairs along with other metadata, per domain. You can run the command below to use our method. The data processing will take some time to finish.
+```
+cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/preprocess_data
+bash run_data_preprocess.sh
+```
+3. Optional - Sample queries for benchmark
+The CRAG dataset has more than 4000 queries, and running all of them can be very expensive and time-consuming. You can sample a subset for the benchmark. Here we provide a script to sample up to 5 queries per question_type and dynamism category in each domain. For example, we were able to get 92 queries from the music domain using the script.
+```
+bash run_sample_data.sh
+```
+3. Use the small subset that we have processed for a quick run
+```
+Small data files in this repo
+```
+## Launch agent QnA system
+Here we showcase a RAG agent from the GenAIExamples repo. Please refer to the README in the AgentQnA example for more details. You can build your own agent system using OPEA components, then expose it as an endpoint for this benchmark.
+1. Build images
+```
+git clone
+cd GenAIExamples/AgentQnA/tests/
+bash 1_build_images.sh
+```
+2. Start retrieval tool
+```
+bash 2_start_retrieval_tool.sh
+```
+3. Ingest data into vector database and validate retrieval tool
+```
+# Placeholder - may change depending on data
+bash 3_ingest_data_and_validate_retrieval.sh
+```
+4. Launch and validate agent endpoint
+```
+bash 4_launch_and_validate_agent.sh
+```
+
+## Run CRAG benchmark
+Once you have your agent system up and running, you can follow the steps below to run the benchmark.
+1. Generate answers with agent
+Change the variables in the script below and run the script. By default, it will run a sampled set of queries in the music domain.
+```
+cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark
+bash run_generate_answer.sh
+```
+2. Use LLM-as-judge to grade the answers
+```
+```
+
diff --git a/evals/evaluation/crag_eval/preprocess_data/process_data.py b/evals/evaluation/crag_eval/preprocess_data/process_data.py
new file mode 100644
index 00000000..6f0c6775
--- /dev/null
+++ b/evals/evaluation/crag_eval/preprocess_data/process_data.py
@@ -0,0 +1,111 @@
+import json
+import os
+import argparse
+import tqdm
+
+def split_text(text, chunk_size=2000, chunk_overlap=400):
+    from langchain_text_splitters import RecursiveCharacterTextSplitter
+    text_splitter = RecursiveCharacterTextSplitter(
+        # Configure the splitter; defaults are 2000-character chunks with 400-character overlap.
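+        # Split preferentially on paragraph breaks, then line breaks, then sentence-ending punctuation.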
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        length_function=len,
+        is_separator_regex=False,
+        separators=["\n\n", "\n", ".", "!"],
+    )
+    return text_splitter.split_text(text)
+
+def process_html_string(text):
+    from bs4 import BeautifulSoup
+    # print(text)
+    soup = BeautifulSoup(text, features="html.parser")
+
+    # kill all script and style elements
+    for script in soup(["script", "style"]):
+        script.extract()  # rip it out
+
+    # get text
+    text_content = soup.get_text()
+
+    # break into lines and remove leading and trailing space on each
+    lines = (line.strip() for line in text_content.splitlines())
+    # break multi-headlines into a line each
+    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+    # drop blank lines
+    final_text = '\n'.join(chunk for chunk in chunks if chunk)
+    # print(final_text)
+    return final_text
+
+def preprocess_data(input_file):
+    snippet = []
+    return_data = []
+    n = 0
+    with open(input_file, 'r') as f:
+        for line in f:
+            data = json.loads(line)
+
+            # search results snippets --> retrieval corpus docs
+            docs = data['search_results']
+
+            for doc in docs:
+                # chunks = split_text(doc['page_snippet'])
+                # for chunk in chunks:
+                #     snippet.append({
+                #         "query": data['query'],
+                #         "domain": data['domain'],
+                #         "doc": chunk})
+                snippet.append({
+                    "query": data['query'],
+                    "domain": data['domain'],
+                    "doc": doc['page_snippet']})
+            print('-----------------------------------')
+
+            # qa pairs without search results
+            output = {}
+            for k, v in data.items():
+                if k != 'search_results':
+                    output[k] = v
+            return_data.append(output)
+
+    return snippet, return_data
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--filedir', type=str, default=None)
+    parser.add_argument('--docout', type=str, default=None)
+    parser.add_argument('--qaout', type=str, default=None)
+    # parser.add_argument('--chunk_size', type=int, default=10000)
+    # parser.add_argument('--chunk_overlap', type=int, default=0)
+
+    args = parser.parse_args()
+
+    if not os.path.exists(args.docout):
+        os.makedirs(args.docout)
+
+    if not os.path.exists(args.qaout):
+        os.makedirs(args.qaout)
+
+    data_files = os.listdir(args.filedir)
+
+    qa_pairs = []
+    docs = []
+    for file in tqdm.tqdm(data_files):
+        file = os.path.join(args.filedir, file)
+        doc, data = preprocess_data(file)
+        docs.extend(doc)
+        qa_pairs.extend(data)
+
+    # group by domain
+    domains = ["finance", "music", "movie", "sports", "open"]
+
+    for domain in domains:
+        with open(os.path.join(args.docout, "crag_docs_" + domain + ".jsonl"), 'w') as f:
+            for doc in docs:
+                if doc['doc'] != "" and doc['domain'] == domain:
+                    f.write(json.dumps(doc) + '\n')
+
+        with open(os.path.join(args.qaout, "crag_qa_" + domain + ".jsonl"), 'w') as f:
+            for d in qa_pairs:
+                if d['domain'] == domain:
+                    f.write(json.dumps(d) + '\n')
\ No newline at end of file
diff --git a/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh b/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh
new file mode 100644
index 00000000..8ecfa94f
--- /dev/null
+++ b/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh
@@ -0,0 +1,5 @@
+FILEDIR=$WORKDIR/datasets/crag_task_3_dev_v4
+DOCOUT=$WORKDIR/datasets/crag_docs
+QAOUT=$WORKDIR/datasets/crag_qas
+
+python process_data.py --filedir $FILEDIR --docout $DOCOUT --qaout $QAOUT
\ No newline at end of file
diff --git a/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh b/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh
new file mode
100644 index 00000000..f0bce763 --- /dev/null +++ b/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh @@ -0,0 +1,3 @@ +FILEDIR=$WORKDIR/datasets/crag_docs + +python sample_data.py --filedir $FILEDIR \ No newline at end of file diff --git a/evals/evaluation/crag_eval/preprocess_data/sample_data.py b/evals/evaluation/crag_eval/preprocess_data/sample_data.py new file mode 100644 index 00000000..1fcb4916 --- /dev/null +++ b/evals/evaluation/crag_eval/preprocess_data/sample_data.py @@ -0,0 +1,26 @@ +import json +import pandas as pd +import os +import argparse +import tqdm + +def sample_data(input_file, output_file): + df = pd.read_json(input_file, lines=True, convert_dates=False) + # group by `question_type` and `static_or_dynamic` + df_grouped = df.groupby(['question_type', 'static_or_dynamic']) + # sample 5 rows from each group if there are more than 5 rows else return all rows + df_sampled = df_grouped.apply(lambda x: x.sample(5) if len(x) > 5 else x) + # save sampled data to output file + df_sampled.to_json(output_file, orient='records', lines=True) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--filedir', type=str, default=None) + + args = parser.parse_args() + + data_files = os.listdir(args.filedir) + for file in tqdm.tqdm(data_files): + file = os.path.join(args.filedir, file) + output_file = file.replace('.jsonl', '_sampled.jsonl') + sample_data(file, output_file) \ No newline at end of file diff --git a/evals/evaluation/crag_eval/requirements.txt b/evals/evaluation/crag_eval/requirements.txt new file mode 100644 index 00000000..1411a4a0 --- /dev/null +++ b/evals/evaluation/crag_eval/requirements.txt @@ -0,0 +1 @@ +pandas \ No newline at end of file diff --git a/evals/evaluation/crag_eval/run_benchmark/generate_answers.py b/evals/evaluation/crag_eval/run_benchmark/generate_answers.py new file mode 100644 index 00000000..6c55da3c --- /dev/null +++ b/evals/evaluation/crag_eval/run_benchmark/generate_answers.py @@ -0,0 +1,71 @@ +import requests +import json +import argparse +import os +import pandas as pd + +def get_test_data(args): + if args.query_file.endswith('.jsonl'): + df = pd.read_json(args.query_file, lines=True, convert_dates=False) + elif args.query_file.endswith('.csv'): + df = pd.read_csv(args.query_file) + return df + +def generate_answer(url, prompt): + proxies = {"http": ""} + payload = { + "query":prompt, + } + response = requests.post(url, json=payload, proxies=proxies) + answer = response.json()["text"] + return answer + +def save_results(output_file, output_list): + with open(output_file, "w") as f: + for output in output_list: + f.write(json.dumps(output)) + f.write("\n") + +def save_as_csv(output): + df = pd.read_json(output, lines=True, convert_dates=False) + df.to_csv(output.replace(".jsonl", ".csv"), index=False) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--endpoint_url", type=str, default=None, help="url of the agent QnA system endpoint") + parser.add_argument("--query_file", type=str, default=None, help="query jsonl file") + parser.add_argument("--output_file", type=str, default="output.jsonl", help="output jsonl file") + args = parser.parse_args() + + url = args.endpoint_url + + df = get_test_data(args) + # df = df.head() # for validation purpose + + output_list = [] + n = 0 + for _, row in df.iterrows(): + q = row['query'] + t = row['query_time'] + prompt = "Question: {}\nThe question was asked at: {}".format(q, t) + print('******Query:\n',prompt) + 
print("******Agent is working on the query") + answer = generate_answer(url, prompt) + print('******Answer from agent:\n',answer) + print('='*50) + output_list.append( + { + "query": q, + "query_time": t, + "ref_answer": row["answer"], + "answer": answer, + "question_type": row["question_type"], + "static_or_dynamic": row["static_or_dynamic"], + } + ) + save_results(args.output_file, output_list) + # n += 1 + # if n > 1: + # break + save_results(args.output_file, output_list) + save_as_csv(args.output_file) \ No newline at end of file diff --git a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py new file mode 100644 index 00000000..b4110a5e --- /dev/null +++ b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py @@ -0,0 +1,8 @@ +from evals.metrics.answer_relevancy import AnswerRelevancyMetric +from evals.metrics.ragas import RagasMetric +from evals.metrics import bleu_score, rougeL_score + +# check data format requirements for each metric +# check answer relevancy vs ragas answer relevancy +# check answer correctness +# check if open-source llm can be used diff --git a/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh b/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh new file mode 100644 index 00000000..24b477ee --- /dev/null +++ b/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh @@ -0,0 +1,13 @@ +host_ip=$(hostname -I | awk '{print $1}') # change this to the IP of the service +port=9095 # change this to the port of the service +endpoint=${port}/v1/chat/completions # change this to the endpoint of the service +URL="http://${host_ip}:${endpoint}" +echo "AGENT ENDPOINT URL: ${URL}" + +QUERYFILE=$WORKDIR/datasets/crag_qas/crag_qa_music_sampled.jsonl +OUTPUTFILE=$WORKDIR/datasets/crag_results/results.jsonl + +python generate_answers.py \ +--endpoint_url ${URL} \ +--query_file $QUERYFILE \ +--output_file $OUTPUTFILE From 6da78fe7f0735e42f3040415ede3b5ed350dfe86 Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Fri, 23 Aug 2024 13:31:31 -0700 Subject: [PATCH 02/18] add first pass llm eval code --- evals/evaluation/crag_eval/README.md | 28 +++++++++-- evals/evaluation/crag_eval/requirements.txt | 1 - .../crag_eval/run_benchmark/grade_answers.py | 50 ++++++++++++++++--- .../llm_judge/docker-compose-llm-judge.yaml | 19 +++++++ .../llm_judge/launch_llm_judge_endpoint.sh | 4 ++ .../crag_eval/run_benchmark/run_grading.sh | 8 +++ 6 files changed, 98 insertions(+), 12 deletions(-) delete mode 100644 evals/evaluation/crag_eval/requirements.txt create mode 100644 evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml create mode 100644 evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh create mode 100644 evals/evaluation/crag_eval/run_benchmark/run_grading.sh diff --git a/evals/evaluation/crag_eval/README.md b/evals/evaluation/crag_eval/README.md index 5f68b24c..52e2e028 100644 --- a/evals/evaluation/crag_eval/README.md +++ b/evals/evaluation/crag_eval/README.md @@ -9,13 +9,16 @@ export $WORKDIR= cd $WORKDIR git clone https://github.com/opea-project/GenAIEval.git ``` -2. Create conda environment and install packages +2. Build docker image ``` -conda create -n crag-benchmark-env python=3.11 -conda activate crag-benchmark-env -cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/ -pip install -r requirements.txt +cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/docker/ +bash build_image.sh ``` +3. 
Start docker container +``` +bash launch_eval_container.sh +``` + ## CRAG dataset 1. Download original data and process it with commands below. You need to create an account on the Meta CRAG challenge [website](https://www.aicrowd.com/challenges/meta-comprehensive-rag-benchmark-kdd-cup-2024). After login, go to this [link](https://www.aicrowd.com/challenges/meta-comprehensive-rag-benchmark-kdd-cup-2024/problems/meta-kdd-cup-24-crag-end-to-end-retrieval-augmented-generation/dataset_files) and download the `crag_task_3_dev_v4.tar.bz2` file. Then make a `datasets` directory in your work directory using the commands below. @@ -74,6 +77,21 @@ cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark bash run_generate_answer.sh ``` 2. Use LLM-as-judge to grade the answers +First, in another terminal, launch llm endpoint with HF TGI +``` +cd llm_judge +bash launch_llm_judge_endpoint.sh +``` +Validate that the llm endpoint is working properly. +``` +export host_ip=$(hostname -I | awk '{print $1}') +curl ${host_ip}:8085/generate_stream \ + -X POST \ + -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \ + -H 'Content-Type: application/json' +``` +Second, back to the interactive crag-eval docker, run command below ``` +bash run_grading.sh ``` diff --git a/evals/evaluation/crag_eval/requirements.txt b/evals/evaluation/crag_eval/requirements.txt deleted file mode 100644 index 1411a4a0..00000000 --- a/evals/evaluation/crag_eval/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -pandas \ No newline at end of file diff --git a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py index b4110a5e..35413e8d 100644 --- a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py +++ b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py @@ -1,8 +1,46 @@ -from evals.metrics.answer_relevancy import AnswerRelevancyMetric from evals.metrics.ragas import RagasMetric -from evals.metrics import bleu_score, rougeL_score +import argparse +import pandas as pd +import os + +def convert_data_format_for_ragas(data): + # data: pandas dataframe + # columns: ['query', 'answer', 'ref_answer'] + # return: a dict with keys: 'input', 'actual_output', 'expected_output' + output = { + 'input': data['query'].tolist(), + 'actual_output': data['answer'].tolist(), + 'expected_output': data['ref_answer'].tolist(), + 'retrieval_context': data['ref_answer'].tolist() + } + return output + + +def grade_answers(args, test_case): + from langchain_community.embeddings import HuggingFaceBgeEmbeddings + print('==============getting embeddings==============') + embeddings = HuggingFaceBgeEmbeddings(model_name=args.embed_model) + print('==============initiating metric==============') + metric = RagasMetric(threshold=0.5, + metrics=["answer_correctness"], + model= args.llm_endpoint, + embeddings=embeddings) + print('==============start grading==============') + metric.measure(test_case) + print(metric.score) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--embed_model", type=str, default="BAAI/bge-base-en-v1.5") + parser.add_argument("--llm_endpoint", type=str, default="http://localhost:8008") + parser.add_argument("--filedir", type=str, help="Path to the file containing the data") + parser.add_argument("--filename", type=str, help="Name of the file containing the data") + args = parser.parse_args() + + data = pd.read_csv(os.path.join(args.filedir, args.filename)) + data = data.head(2) + print(data) + test_case = 
convert_data_format_for_ragas(data) + print(test_case) + grade_answers(args, test_case) -# check data format requirements for each metric -# check answer relevancy vs ragas answer relevancy -# check answer correctness -# check if open-source llm can be used diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml new file mode 100644 index 00000000..4f6f7f23 --- /dev/null +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml @@ -0,0 +1,19 @@ +version: "3.8" + +services: + tgi_service: + image: ghcr.io/huggingface/text-generation-inference:2.1.0 + container_name: tgi-service + ports: + - "8085:80" + volumes: + - ${HF_CACHE_DIR}:/data + shm_size: 1g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HF_HUB_DISABLE_PROGRESS_BARS: 1 + HF_HUB_ENABLE_HF_TRANSFER: 0 + command: --model-id ${LLM_MODEL_ID} \ No newline at end of file diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh b/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh new file mode 100644 index 00000000..d927cc05 --- /dev/null +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh @@ -0,0 +1,4 @@ +export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" #"meta-llama/Meta-Llama-3-70B-Instruct" # +export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} +export HF_CACHE_DIR=${HF_CACHE_DIR} +docker compose -f docker-compose-llm-judge.yaml up -d \ No newline at end of file diff --git a/evals/evaluation/crag_eval/run_benchmark/run_grading.sh b/evals/evaluation/crag_eval/run_benchmark/run_grading.sh new file mode 100644 index 00000000..ff5a22f7 --- /dev/null +++ b/evals/evaluation/crag_eval/run_benchmark/run_grading.sh @@ -0,0 +1,8 @@ +FILEDIR=$WORKDIR/datasets/crag_results/ +FILENAME=crag_20queries_react_docgradertool_top5apis_v2sysm_gpt4omini.csv +LLM_ENDPOINT=http://${host_ip}:8085 + +python3 grade_answers.py \ +--filedir $FILEDIR \ +--filename $FILENAME \ +--llm_endpoint $LLM_ENDPOINT \ From d57f9d5c26428bae0899bdb677c0ce5335055ef8 Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Mon, 26 Aug 2024 09:53:40 -0700 Subject: [PATCH 03/18] fix answer correctness code Signed-off-by: minmin-intel --- evals/evaluation/crag_eval/README.md | 12 +++++++-- .../crag_eval/run_benchmark/grade_answers.py | 3 ++- evals/metrics/ragas/ragas.py | 25 +++++++++++++++++-- 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/evals/evaluation/crag_eval/README.md b/evals/evaluation/crag_eval/README.md index 52e2e028..aa7097be 100644 --- a/evals/evaluation/crag_eval/README.md +++ b/evals/evaluation/crag_eval/README.md @@ -14,7 +14,15 @@ git clone https://github.com/opea-project/GenAIEval.git cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/docker/ bash build_image.sh ``` -3. Start docker container +3. Set environment vars for downloading models from Huggingface +``` +mkdir $WORKDIR/hf_cache +export HF_CACHE_DIR=$WORKDIR/hf_cache +export HF_HOME=$HF_CACHE_DIR +export HUGGINGFACEHUB_API_TOKEN= +``` +4. Start docker container +This container will be used to preprocess dataset and run benchmark scripts. ``` bash launch_eval_container.sh ``` @@ -42,7 +50,7 @@ The CRAG dataset has more than 4000 queries, and running all of them can be very ``` bash run_sample_data.sh ``` -3. 
Use the small subset that we have processed for a quick run +4. Use the small subset that we have processed for a quick run ``` Small data files in this repo ``` diff --git a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py index 35413e8d..21c16db3 100644 --- a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py +++ b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py @@ -1,4 +1,5 @@ from evals.metrics.ragas import RagasMetric +from ragas.metrics import answer_correctness import argparse import pandas as pd import os @@ -11,7 +12,7 @@ def convert_data_format_for_ragas(data): 'input': data['query'].tolist(), 'actual_output': data['answer'].tolist(), 'expected_output': data['ref_answer'].tolist(), - 'retrieval_context': data['ref_answer'].tolist() + 'retrieval_context': [["dummy_context"] for _ in range(data['query'].shape[0])] } return output diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index 4069a62c..945b8671 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -16,6 +16,27 @@ def format_ragas_metric_name(name: str): return f"{name} (ragas)" +def get_metric(name: str): + validated_list = ["answer_relevancy", "faithfulness", "answer_correctness"] + if name == "answer_relevancy": + from ragas.metrics import answer_relevancy + return answer_relevancy + elif name == "faithfulness": + from ragas.metrics import faithfulness + return faithfulness + elif name == "answer_correctness": + from ragas.metrics import answer_correctness + return answer_correctness + else: + raise ValueError( + "metric should be in supported list {}. ".format(validated_list) + + "ClientResponseError raised with LangchainLLM " + + "when context_precision, context_recall ran. " + + "Here are the related issues described in ragas " + "https://github.com/explodinggradients/ragas/issues/934, " + + "https://github.com/explodinggradients/ragas/issues/664." + ) + class RagasMetric: """This metric checks if the output is more than 3 letters.""" @@ -41,7 +62,7 @@ def measure(self, test_case: Dict): # sends to server try: from ragas import evaluate - from ragas.metrics import answer_relevancy, faithfulness + from ragas.metrics import answer_relevancy, faithfulness, answer_correctness except ModuleNotFoundError: raise ModuleNotFoundError("Please install ragas to use this metric. 
`pip install ragas`.") @@ -85,7 +106,7 @@ def measure(self, test_case: Dict): else: if metric == "answer_relevancy" and self.embeddings is None: raise ValueError("answer_relevancy metric need provide embeddings model.") - tmp_metrics.append(metric) + tmp_metrics.append(get_metric(metric)) self.metrics = tmp_metrics else: self.metrics = [ From a61efa8f127297e6871d9cfe3a18ecd0e1548530 Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Mon, 26 Aug 2024 09:59:45 -0700 Subject: [PATCH 04/18] docker container for crag eval --- evals/evaluation/crag_eval/docker/Dockerfile | 23 +++++++++++++++++++ .../crag_eval/docker/build_image.sh | 9 ++++++++ .../crag_eval/docker/launch_eval_container.sh | 4 ++++ .../crag_eval/docker/requirements.txt | 7 ++++++ 4 files changed, 43 insertions(+) create mode 100644 evals/evaluation/crag_eval/docker/Dockerfile create mode 100644 evals/evaluation/crag_eval/docker/build_image.sh create mode 100644 evals/evaluation/crag_eval/docker/launch_eval_container.sh create mode 100644 evals/evaluation/crag_eval/docker/requirements.txt diff --git a/evals/evaluation/crag_eval/docker/Dockerfile b/evals/evaluation/crag_eval/docker/Dockerfile new file mode 100644 index 00000000..0421000a --- /dev/null +++ b/evals/evaluation/crag_eval/docker/Dockerfile @@ -0,0 +1,23 @@ +FROM ubuntu:22.04 + +WORKDIR /home/user + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + python3.11 \ + python3-pip \ + libpoppler-cpp-dev \ + wget \ + git \ + poppler-utils \ + libmkl-dev + +COPY requirements.txt /home/user/requirements.txt + +RUN pip install -r requirements.txt + +RUN cd /home/user/ && \ + git clone https://github.com/opea-project/GenAIEval.git + +ENV PYTHONPATH=$PYTHONPATH:/home/user/GenAIEval/ + +WORKDIR /home/user \ No newline at end of file diff --git a/evals/evaluation/crag_eval/docker/build_image.sh b/evals/evaluation/crag_eval/docker/build_image.sh new file mode 100644 index 00000000..284762c7 --- /dev/null +++ b/evals/evaluation/crag_eval/docker/build_image.sh @@ -0,0 +1,9 @@ +dockerfile=Dockerfile + +docker build \ + -f ${dockerfile} . 
\ + -t crag-eval:latest \ + --network=host \ + --build-arg http_proxy=${http_proxy} \ + --build-arg https_proxy=${https_proxy} \ + --build-arg no_proxy=${no_proxy} \ \ No newline at end of file diff --git a/evals/evaluation/crag_eval/docker/launch_eval_container.sh b/evals/evaluation/crag_eval/docker/launch_eval_container.sh new file mode 100644 index 00000000..32b27f8e --- /dev/null +++ b/evals/evaluation/crag_eval/docker/launch_eval_container.sh @@ -0,0 +1,4 @@ +volume=$WORKDIR +host_ip=$(hostname -I | awk '{print $1}') + +docker run -it -v $volume:/home/user/ -e WORKDIR=/home/user -e HF_HOME=/home/user/hf_cache -e host_ip=$host_ip -e http_proxy=$http_proxy -e https_proxy=$https_proxy crag-eval:latest \ No newline at end of file diff --git a/evals/evaluation/crag_eval/docker/requirements.txt b/evals/evaluation/crag_eval/docker/requirements.txt new file mode 100644 index 00000000..662f3146 --- /dev/null +++ b/evals/evaluation/crag_eval/docker/requirements.txt @@ -0,0 +1,7 @@ +pandas +datasets +ragas +langchain-community +evaluate +jieba +sentence_transformers \ No newline at end of file From 914cdc2a68b6307425445bb752250cb788ffc70d Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Mon, 26 Aug 2024 10:00:15 -0700 Subject: [PATCH 05/18] sample data for testing --- .../run_benchmark/sample_answers.csv | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 evals/evaluation/crag_eval/run_benchmark/sample_answers.csv diff --git a/evals/evaluation/crag_eval/run_benchmark/sample_answers.csv b/evals/evaluation/crag_eval/run_benchmark/sample_answers.csv new file mode 100644 index 00000000..790f6f6d --- /dev/null +++ b/evals/evaluation/crag_eval/run_benchmark/sample_answers.csv @@ -0,0 +1,28 @@ +query,query_time,ref_answer,answer,context,num_llm_calls,total_tokens,question_type,static_or_dynamic,selected_tools +how many reading and leeds festivals has the band foo fighters headlined?,"03/13/2024, 09:55:12 PT",foo fighters have headlined a total of 4 reading and leeds festivals.,"The Foo Fighters have headlined the Reading and Leeds Festivals four times: in 2002, 2005, 2012, and 2019.","[[{'id': 'call_05BuB1KtO8uuJypQh6lecBVM', 'function': {'arguments': '{""query"":""Foo Fighters Reading and Leeds Festival headlining history""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'The Foo Fighters have a notable history of headlining the Reading and Leeds Festivals. They first headlined the festival in 2002 and returned to headline again in 2005. After a significant gap, they headlined once more in 2012. In 2019, they were announced as one of the headliners again, marking their return to the festival after several years. Overall, they have been a prominent presence at the festival since their first appearance in 1995.', 'The Foo Fighters have headlined the Reading and Leeds Festivals four times: in 2002, 2005, 2012, and 2019.']",3,959,aggregation,slow-changing,"['get_members', 'get_song_release_date', 'get_song_author', 'get_grammy_award_count_by_song', 'get_grammy_award_count_by_artist']" +how many songs has the band the beatles released that have been recorded at abbey road studios?,"03/13/2024, 09:54:32 PT",the beatles have released 190 songs that have been recorded at abbey road studios.,"The Beatles recorded most of their music at Abbey Road Studios, but the exact number of songs they released that were recorded there is not specified in the available information. 
However, it is known that they recorded a significant portion of their catalog at this studio from 1957 to 1970.","[[{'id': 'call_wthw0YH3cxU3yzYMdhfyY2OI', 'function': {'arguments': '{""query"":""The Beatles songs recorded at Abbey Road Studios""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'The Beatles recorded most of their music at Abbey Road Studios, formerly known as EMI Recording Studios, from their inception in 1957 until their break-up in 1970. This includes iconic albums such as ""Abbey Road,"" ""Sgt. Pepper\'s Lonely Hearts Club Band,"" and ""The White Album."" The studio is renowned for its cultural and historical significance in the music industry, particularly for the Beatles\' extensive catalog.', 'The Beatles recorded most of their music at Abbey Road Studios, but the exact number of songs they released that were recorded there is not specified in the available information. However, it is known that they recorded a significant portion of their catalog at this studio from 1957 to 1970.']",3,968,aggregation,static,"['get_grammy_award_count_by_song', 'get_song_release_date', 'get_song_release_country', 'get_members', 'search_song_entity_by_name']" +did the song the globe come out before the wedding?,"03/21/2024, 23:32:29 PT",yes,"The song ""The Globe"" was released in 1991, while ""The Wedding"" was released on September 23, 2016. Therefore, ""The Globe"" came out before ""The Wedding.""","[[{'id': 'call_7wKEamOXFKVmki6gklVACm0M', 'function': {'arguments': '{""song_name"": ""The Globe""}', 'name': 'get_song_release_date'}, 'type': 'function'}, {'id': 'call_rfOh7EoCSkyDRGPe7DvgKMFE', 'function': {'arguments': '{""song_name"": ""The Wedding""}', 'name': 'get_song_release_date'}, 'type': 'function'}], '{""result"": ""1991""}', '{""result"": ""2016-09-23""}', 'The song ""The Globe"" was released in 1991, while ""The Wedding"" was released on September 23, 2016. Therefore, ""The Globe"" came out before ""The Wedding.""']",2,839,comparison,static,"['get_song_release_country', 'search_song_entity_by_name', 'get_song_release_date', 'get_grammy_best_song_by_year', 'get_song_author']" +"who has had more number one hits on the us billboard hot 100 chart, michael jackson or elvis presley?","03/13/2024, 09:43:49 PT","elvis presley has had more number one hits on the us billboard hot 100 chart than michael jackson, with 18 number one hits compared to jackson's 13.","Elvis Presley had more number one hits on the US Billboard Hot 100 chart, with a total of 18 hits, compared to Michael Jackson's 13 number one hits.","[[{'id': 'call_i5H8SAsGgRTXnvI0sDm5wmrv', 'function': {'arguments': '{""query"": ""Michael Jackson number one hits on US Billboard Hot 100""}', 'name': 'search_knowledge_base'}, 'type': 'function'}, {'id': 'call_Mgec4kbeE0z4h3QhJ0ENNTHU', 'function': {'arguments': '{""query"": ""Elvis Presley number one hits on US Billboard Hot 100""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Michael Jackson had 13 number-one hits on the US Billboard Hot 100.', 'Elvis Presley had 18 number one hits on the US Billboard Hot 100 chart. 
His number one singles spent a total of 81 weeks at the top position, which is the most of any artist.', ""Elvis Presley had more number one hits on the US Billboard Hot 100 chart, with a total of 18 hits, compared to Michael Jackson's 13 number one hits.""]",4,1136,comparison,static,"['get_grammy_award_count_by_song', 'get_grammy_award_count_by_artist', 'get_song_author', 'search_song_entity_by_name', 'get_song_release_country']" +when was alex van halen the drummer for the band santana?,"03/13/2024, 10:04:31 PT",invalid question,Invalid question.,['Invalid question.'],1,343,false_premise,static,"['get_members', 'get_song_author', 'get_song_release_date', 'get_grammy_award_date_by_artist', 'get_grammy_award_count_by_song']" +"when did dolly parton's song, blown away, come out?","03/10/2024, 23:28:06 PT",invalid question,"Carrie Underwood's song ""Blown Away"" was released on May 1, 2012.","[[{'id': 'call_7ZIr0njVYoqlfvoCdd6pLmHb', 'function': {'arguments': '{""song_name"":""Blown Away""}', 'name': 'get_song_release_country'}, 'type': 'function'}], '{""result"": ""ID""}', [{'id': 'call_ZUYqfFFb4VRGVHwhrG3nsV8L', 'function': {'arguments': '{""song_name"":""Blown Away""}', 'name': 'get_song_author'}, 'type': 'function'}], '{""result"": ""Carrie Underwood""}', [{'id': 'call_mA2bAMtysJClId77OoUan9yO', 'function': {'arguments': '{""query"":""Blown Away Carrie Underwood release date""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Carrie Underwood\'s album ""Blown Away"" was released on May 1, 2012.', 'Carrie Underwood\'s song ""Blown Away"" was released on May 1, 2012.']",5,1798,false_premise,static,"['get_grammy_award_count_by_song', 'get_song_author', 'get_song_release_country', 'get_grammy_best_song_by_year', 'get_grammy_award_date_by_artist']" +what grammy award did edgar barrera win this year?,"03/05/2024, 23:25:22 PT",invalid question,"Edgar Barrera won a Grammy award in 2014, but there is no record of him winning a Grammy award in 2024.","[[{'id': 'call_ptgqp9zavnUvzSqmBNPNa8R3', 'function': {'arguments': '{""artist_name"":""Edgar Barrera""}', 'name': 'get_grammy_award_date_by_artist'}, 'type': 'function'}], '{""result"": [2014]}', 'Edgar Barrera won a Grammy award in 2014, but there is no record of him winning a Grammy award in 2024.']",2,828,false_premise,static,"['get_grammy_award_date_by_artist', 'get_grammy_best_artist_by_year', 'get_grammy_best_album_by_year', 'get_grammy_award_count_by_artist', 'get_grammy_best_song_by_year']" +when did miley cyrus win grammy best new artist award?,"03/21/2024, 23:38:14 PT",invalid question,Miley Cyrus has not won the Grammy Best New Artist award.,"[[{'id': 'call_wCWNt9LOduR5IESTc1jeB6n4', 'function': {'arguments': '{""artist_name"":""Miley Cyrus""}', 'name': 'get_grammy_award_date_by_artist'}, 'type': 'function'}], '{""result"": []}', 'Miley Cyrus has not won the Grammy Best New Artist award.']",2,790,false_premise,static,"['get_all_awarded_artists', 'get_grammy_best_artist_by_year', 'get_grammy_award_date_by_artist', 'get_grammy_award_count_by_artist', 'get_grammy_award_count_by_song']" +when was stevie wonder the keyboardist for the band the rolling stones?,"03/13/2024, 10:01:54 PT",invalid question,Invalid question.,['Invalid question.'],1,355,false_premise,static,"['get_grammy_award_date_by_artist', 'get_artist_birth_date', 'get_grammy_award_count_by_artist', 'get_grammy_award_count_by_song', 'get_artist_birth_place']" +who is nicki minaj's latest diss track about?,"03/10/2024, 23:32:04 PT","nicki minaj's song ""big foot"" is a 
diss track about megan thee stallion.","Nicki Minaj's latest track, titled ""Big Foot,"" released in early 2024, is widely speculated to be a response to Megan Thee Stallion's song ""Hiss."" While many consider it a diss track, Nicki has denied that it is intended as such, stating, ""Who said 'diss track'??????"" The lyrics seem to take shots at Megan, but Nicki has pushed back against the diss track label.","[[{'id': 'call_trQnbJbaT3p5iPBASlNNFK0s', 'function': {'arguments': '{""query"":""Nicki Minaj latest diss track 2024""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Nicki Minaj\'s latest track, titled ""Big Foot,"" has been released in early 2024 and is widely speculated to be a response to Megan Thee Stallion\'s song ""Hiss."" While many are labeling it a diss track, Nicki herself has denied that it is intended as such, stating, ""Who said \'diss track\'??????"" The song features lyrics that appear to take shots at Megan, but Nicki has pushed back against the diss track label.', 'Nicki Minaj\'s latest track, titled ""Big Foot,"" released in early 2024, is widely speculated to be a response to Megan Thee Stallion\'s song ""Hiss."" While many consider it a diss track, Nicki has denied that it is intended as such, stating, ""Who said \'diss track\'??????"" The lyrics seem to take shots at Megan, but Nicki has pushed back against the diss track label.']",3,1001,multi-hop,fast-changing,"['get_song_author', 'get_all_awarded_artists', 'get_song_release_date', 'get_grammy_best_artist_by_year', 'search_song_entity_by_name']" +what's the most recent album from the founder of ysl records?,"03/05/2024, 23:28:07 PT",business is business,"Sorry, need more steps to process this request.","[[{'id': 'call_2yMQLyoFwYCt2KTxpiQD9I4k', 'function': {'arguments': '{""artist_name"":""Young Stoner Life Records""}', 'name': 'search_artist_entity_by_name'}, 'type': 'function'}], '{""result"": [""Young Stoner Life"", ""Young Life"", ""Stoner"", ""Re*Cords"", ""Alyson Stoner"", ""Art Stoner"", ""Billy Stoner"", ""Brian Stoner"", ""Colin Stoner"", ""David Stoner""]}', [{'id': 'call_dBz3tCbgZz7C9jSKczWKUP5T', 'function': {'arguments': '{""artist_name"":""Young Thug""}', 'name': 'search_artist_entity_by_name'}, 'type': 'function'}], '{""result"": [""Young Thug"", ""Young T.H.U.G."", ""Thug"", ""T.H.U.G"", ""T.H.U.G."", ""Arden Thug"", ""Babs Thug"", ""Bambie Thug"", ""blond thug"", ""Cali Thug""]}', [{'id': 'call_GbuvwSrKem9ZbExvw6gf1U8I', 'function': {'arguments': '{""artist_name"":""Young Thug""}', 'name': 'get_artist_all_works'}, 'type': 'function'}], '{""result"": [""All in a Day"", ""1017 Thug"", ""Black Portland"", ""Thuga Thuga"", ""1017 Thug 2"", ""1017 Thug 3: The Finale"", ""I Came From Nothing 3"", ""I Came From Nothing 2"", ""Barter 6"", ""Slime Season 2"", ""Slime Season 3"", ""I Came From Nothing"", ""Gangster Shit"", ""JEFFERY"", ""Old English"", ""Gang Up"", ""Beautiful Thugger Girls"", ""Young Martha"", ""Danny Glover"", ""Slime Language"", ""On the Rvn!!"", ""The London"", ""Thugga Thugga"", ""Metro Thuggin"", ""Hot (remix)"", ""Slime Season 4"", ""Cheat Code Mode"", ""Trap Zuse"", ""2 Bitches"", ""Young Rich Homie Thuggin"", ""Tell Her Nothing"", ""Stoner (remixes)"", ""Liger"", ""Brick Sqaud"", ""I Came from Nothing 2"", ""A Nightmare From Beyond"", ""Street Is Watching (The Mix-Tape)"", ""BUSINESS IS BUSINESS"", ""From a Man"", ""From A Man""]}', [{'id': 'call_NQ22bfeZLvSY61NxL2YP1YBb', 'function': {'arguments': '{""artist_name"":""Young Thug""}', 'name': 'get_artist_all_works'}, 
'type': 'function'}], '{""result"": [""All in a Day"", ""1017 Thug"", ""Black Portland"", ""Thuga Thuga"", ""1017 Thug 2"", ""1017 Thug 3: The Finale"", ""I Came From Nothing 3"", ""I Came From Nothing 2"", ""Barter 6"", ""Slime Season 2"", ""Slime Season 3"", ""I Came From Nothing"", ""Gangster Shit"", ""JEFFERY"", ""Old English"", ""Gang Up"", ""Beautiful Thugger Girls"", ""Young Martha"", ""Danny Glover"", ""Slime Language"", ""On the Rvn!!"", ""The London"", ""Thugga Thugga"", ""Metro Thuggin"", ""Hot (remix)"", ""Slime Season 4"", ""Cheat Code Mode"", ""Trap Zuse"", ""2 Bitches"", ""Young Rich Homie Thuggin"", ""Tell Her Nothing"", ""Stoner (remixes)"", ""Liger"", ""Brick Sqaud"", ""I Came from Nothing 2"", ""A Nightmare From Beyond"", ""Street Is Watching (The Mix-Tape)"", ""BUSINESS IS BUSINESS"", ""From a Man"", ""From A Man""]}', 'Sorry, need more steps to process this request.']",5,500,multi-hop,slow-changing,"['get_song_author', 'search_artist_entity_by_name', 'get_grammy_award_count_by_artist', 'get_all_awarded_artists', 'get_artist_all_works']" +what's the most recent album from the puerto rican artist that's been in wwe?,"03/10/2024, 23:31:52 PT",nadie sabe lo que va a pasar mañana,"The most recent album from the Puerto Rican artist Bad Bunny, who has been in WWE, is ""Nadie sabe lo que va a pasar mañana,"" released on October 13, 2023.","[[{'id': 'call_b8OrYeD5gSQjBOnUjJvtg6Y8', 'function': {'arguments': '{""artist_name"":""Bad Bunny""}', 'name': 'search_artist_entity_by_name'}, 'type': 'function'}], '{""result"": [""Bad Bunny"", ""Bunny"", ""All Bad Bunny Everything"", ""bAd bAd"", ""The Bad Bad Bad"", ""Bad"", ""B.A.D."", ""BAD!"", ""BAD"", ""Bad Bad Bird""]}', [{'id': 'call_jKkRPtMcqc8Pp9127rxo3jV9', 'function': {'arguments': '{""artist_name"":""Bad Bunny""}', 'name': 'get_grammy_award_count_by_artist'}, 'type': 'function'}], '{""result"": 0}', [{'id': 'call_7PySlZykLTc7aHyLSi5EdMmJ', 'function': {'arguments': '{""artist_name"":""Bad Bunny""}', 'name': 'get_grammy_award_date_by_artist'}, 'type': 'function'}], '{""result"": []}', [{'id': 'call_21vZE6bAa9CfdWONNgwEYBAU', 'function': {'arguments': '{""query"":""Bad Bunny discography""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Bad Bunny\'s discography includes several studio albums, with his latest being ""Nadie sabe lo que va a pasar mañana,"" released on October 13, 2023. This album features 22 tracks and includes collaborations with artists such as Young Miko, Feid, Arcángel, Bryant Myers, and Eladio Carrión. His previous albums include ""Un verano sin ti"" and others that have contributed to his recognition as a leading artist in the Latin music scene.', 'The most recent album from the Puerto Rican artist Bad Bunny, who has been in WWE, is ""Nadie sabe lo que va a pasar mañana,"" released on October 13, 2023.']",6,2614,multi-hop,slow-changing,"['get_all_awarded_artists', 'get_grammy_award_count_by_artist', 'get_grammy_award_date_by_artist', 'search_artist_entity_by_name', 'get_grammy_best_artist_by_year']" +who has played drums for the red hot chili peppers?,"03/17/2024, 17:08:10 PT","chad smith, jack irons, and cliff martinez","The drummers who have played for the Red Hot Chili Peppers include: + +1. Chad Smith +2. Jack Irons +3. Cliff Martinez +4. D.H. 
Peligro + +These are some of the notable drummers associated with the band.","[[{'id': 'call_O5mU9etmvrZ2rpqe38tvglh8', 'function': {'arguments': '{""band_name"":""Red Hot Chili Peppers""}', 'name': 'get_members'}, 'type': 'function'}], '{""result"": [""John Frusciante"", ""Flea"", ""D.H. Peligro"", ""DeWayne \\""Blackbyrd\\"" McKnight"", ""Arik Marshall"", ""Josh Klinghoffer"", ""Chad Smith"", ""Dave Navarro"", ""Hillel Slovak"", ""Jack Irons"", ""Jack Sherman"", ""Cliff Martinez"", ""Anthony Kiedis"", ""Jesse Tobias""]}', 'The drummers who have played for the Red Hot Chili Peppers include:\n\n1. Chad Smith\n2. Jack Irons\n3. Cliff Martinez\n4. D.H. Peligro\n\nThese are some of the notable drummers associated with the band.']",2,858,set,static,"['get_members', 'get_song_author', 'get_artist_birth_place', 'get_grammy_award_count_by_artist', 'get_grammy_award_count_by_song']" +how many tracks are in drake's last album?,"03/12/2024, 12:28:41 PT",23,"Drake's last album, ""For All the Dogs,"" has a total of 23 tracks.","[[{'id': 'call_nXv914FUcVGCYeyW2MdmigSz', 'function': {'arguments': '{""query"":""Drake last album track count""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Drake\'s last album, ""For All the Dogs,"" has a track count of 23.', 'Drake\'s last album, ""For All the Dogs,"" has a total of 23 tracks.']",3,852,simple,slow-changing,"['get_grammy_award_count_by_song', 'get_grammy_award_count_by_artist', 'get_all_awarded_artists', 'get_song_release_date', 'get_song_release_country']" +how many number one hits did taylor swift have?,"03/12/2024, 12:28:35 PT",11,"As of February 24, 2024, Taylor Swift has 11 number one hits on the Billboard Hot 100.","[[{'id': 'call_AeMTBYLShMlu3PzfQztMCxns', 'function': {'arguments': '{""query"":""Taylor Swift number one hits""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'As of the Hot 100 dated February 24, 2024, Taylor Swift has 11 number one hits on the Billboard Hot 100.', 'As of February 24, 2024, Taylor Swift has 11 number one hits on the Billboard Hot 100.']",3,877,simple,slow-changing,"['get_grammy_award_count_by_song', 'get_grammy_award_count_by_artist', 'get_song_author', 'get_grammy_award_date_by_artist', 'search_song_entity_by_name']" +where is marem ladson's original home country?,"03/21/2024, 23:41:53 PT",es,Marem Ladson's original home country is Spain (ES).,"[[{'id': 'call_Zey99OPDixDXc3nUHZnD5MVC', 'function': {'arguments': '{""artist_name"":""Marem Ladson""}', 'name': 'get_artist_birth_place'}, 'type': 'function'}], '{""result"": ""ES""}', ""Marem Ladson's original home country is Spain (ES).""]",2,716,simple,static,"['get_artist_birth_place', 'get_song_release_country', 'get_song_author', 'get_artist_birth_date', 'search_song_entity_by_name']" +tell me the most recent song or album by doris duke?,"03/21/2024, 23:40:20 PT",i’m a loser: the swamp dogg sessions... 
and more,"The most recent album by Doris Duke is ""I'm A Loser,"" which was released on February 4, 2021.","[[{'id': 'call_IBn6alF1JcL2QNYxDvkd9Nla', 'function': {'arguments': '{""song_name"":""Doris Duke""}', 'name': 'search_song_entity_by_name'}, 'type': 'function'}], '{""result"": [""Passion Flower (For Doris Duke)"", ""Doris"", ""Black Doris"", ""Clara / Doris"", ""Doris 100"", ""Doris Day"", ""Doris Machin"", ""Doris & Me"", ""Doris Rising"", ""Doris Troy""]}', [{'id': 'call_rfPHKHEPZ1UntCcHBZBOeQtM', 'function': {'arguments': '{""query"":""Doris Duke most recent song or album""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Doris Duke\'s most recent album is ""I\'m A Loser,"" which was released on February 4, 2021.', 'The most recent album by Doris Duke is ""I\'m A Loser,"" which was released on February 4, 2021.']",4,1418,simple_w_condition,fast-changing,"['search_song_entity_by_name', 'get_grammy_award_count_by_song', 'get_song_release_date', 'get_song_author', 'get_grammy_award_count_by_artist']" +what song topped the billboard chart on 2004-02-04?,"03/21/2024, 23:34:49 PT",hey ya!,"The song that topped the Billboard chart on February 4, 2004, was ""Hey Ya!"" by OutKast.","[[{'id': 'call_nwInngToFLLUFMeKYuR5lwhx', 'function': {'arguments': '{""rank"":1,""date"":""2004-02-04""}', 'name': 'get_billboard_rank_date'}, 'type': 'function'}], '{""result"": [[""Hey Ya!""], [""OutKast""]]}', 'The song that topped the Billboard chart on February 4, 2004, was ""Hey Ya!"" by OutKast.']",2,794,simple_w_condition,fast-changing,"['get_billboard_attributes', 'get_song_release_date', 'search_song_entity_by_name', 'get_song_release_country', 'get_billboard_rank_date']" +what is the most popular song on billboard in 2024-02-28?,"03/21/2024, 23:34:42 PT",texas hold 'em,"The most popular song on Billboard on February 28, 2024, was ""Texas Hold 'Em"" by Beyonce.","[[{'id': 'call_ctgHbLk8zIctCE0qOgjtcTU7', 'function': {'arguments': '{""rank"":1,""date"":""2024-02-28""}', 'name': 'get_billboard_rank_date'}, 'type': 'function'}], '{""result"": [[""Texas Hold \'Em""], [""Beyonce""]]}', 'The most popular song on Billboard on February 28, 2024, was ""Texas Hold \'Em"" by Beyonce.']",2,798,simple_w_condition,fast-changing,"['get_song_release_date', 'get_billboard_attributes', 'get_song_release_country', 'search_song_entity_by_name', 'get_billboard_rank_date']" +"what album did maroon five release in 2010, which included the songs ""moves like jagger"" and ""misery""?","03/13/2024, 09:49:01 PT","maroon five released the album ""hands all over"" in 2010, which included the songs ""moves like jagger"" and ""misery"".","Sorry, need more steps to process this request.","[[{'id': 'call_TiKh63jRcIYXYAsXxrzUB0Ip', 'function': {'arguments': '{""song_name"":""moves like jagger""}', 'name': 'search_song_entity_by_name'}, 'type': 'function'}], '{""result"": [""Moves Like Jagger"", ""Moves Like Jagger (remix)"", ""Moves Like Jagger (Remixes)"", ""D.N.A. 
Moves Like Jagger Mashup"", ""Moves Like Jagger / Jumpin\' Jack Flash"", ""Moves Like Jagger (The Voice Performance)"", ""Move Like Jagger"", ""Foxy Jagger"", ""Jagger \'67"", ""Peter Jagger""]}', [{'id': 'call_KlfgNIKNzIjm9qAwpWXOkl5n', 'function': {'arguments': '{""song_name"":""misery""}', 'name': 'search_song_entity_by_name'}, 'type': 'function'}], '{""result"": [""Misery"", ""MISERY"", ""misery"", ""Misery..."", ""Artificial Misery"", ""Beautiful Misery"", ""Blissful Misery"", ""Burn // Misery"", ""Collected Misery"", ""Concret\\u00e9 Misery""]}', [{'id': 'call_NePkutCu9BVaeeMEV2xJNXhE', 'function': {'arguments': '{""song_name"":""moves like jagger""}', 'name': 'get_grammy_award_count_by_song'}, 'type': 'function'}], '{""result"": 0}', [{'id': 'call_0fGXDvgbkvbEHMnRNBdIfK7i', 'function': {'arguments': '{""song_name"":""misery""}', 'name': 'get_grammy_award_count_by_song'}, 'type': 'function'}], '{""result"": 0}', 'Sorry, need more steps to process this request.']",5,500,simple_w_condition,static,"['get_song_release_date', 'get_song_release_country', 'get_grammy_best_album_by_year', 'get_grammy_award_count_by_song', 'search_song_entity_by_name']" From 1b49b1b6a05f56d3adb9eb42f61c022f49090d67 Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Mon, 26 Aug 2024 10:17:05 -0700 Subject: [PATCH 06/18] docker compose for tgi gaudi Signed-off-by: minmin-intel --- .../docker-compose-llm-judge-gaudi.yaml | 22 +++++++++++++++++++ .../llm_judge/launch_llm_judge_endpoint.sh | 4 ++-- 2 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml new file mode 100644 index 00000000..a3eb311e --- /dev/null +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml @@ -0,0 +1,22 @@ +services: + tgi-service: + image: ghcr.io/huggingface/tgi-gaudi:latest + container_name: tgi-gaudi-server + ports: + - "8085:80" + volumes: + - ${HF_CACHE_DIR}:/data + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HF_HUB_DISABLE_PROGRESS_BARS: 1 + HF_HUB_ENABLE_HF_TRANSFER: 0 + HABANA_VISIBLE_DEVICES: all + OMPI_MCA_btl_vader_single_copy_mechanism: none + runtime: habana + cap_add: + - SYS_NICE + ipc: host + command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 \ No newline at end of file diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh b/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh index d927cc05..56ce8ab9 100644 --- a/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh @@ -1,4 +1,4 @@ -export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" #"meta-llama/Meta-Llama-3-70B-Instruct" # +export LLM_MODEL_ID="meta-llama/Meta-Llama-3-70B-Instruct" #"meta-llama/Meta-Llama-3-70B-Instruct" # export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export HF_CACHE_DIR=${HF_CACHE_DIR} -docker compose -f docker-compose-llm-judge.yaml up -d \ No newline at end of file +docker compose -f docker-compose-llm-judge-gaudi.yaml up -d \ No newline at end of file From 7867461208b4abe4e68ac5912c5130f03f406b33 Mon Sep 17 
00:00:00 2001 From: minmin-intel Date: Mon, 26 Aug 2024 10:24:24 -0700 Subject: [PATCH 07/18] fix tgi gaudi docker compose for llama3 70b --- .../llm_judge/docker-compose-llm-judge-gaudi.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml index a3eb311e..3bed748f 100644 --- a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml @@ -1,7 +1,7 @@ services: tgi-service: image: ghcr.io/huggingface/tgi-gaudi:latest - container_name: tgi-gaudi-server + container_name: tgi-server ports: - "8085:80" volumes: @@ -15,8 +15,9 @@ services: HF_HUB_ENABLE_HF_TRANSFER: 0 HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none + PT_HPU_ENABLE_LAZY_COLLECTIVES: true runtime: habana cap_add: - SYS_NICE ipc: host - command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 \ No newline at end of file + command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 --sharded true --num-shard 4 \ No newline at end of file From 8e999dcd420f1a6be92fd409da78e5f2ebcaaa6f Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Mon, 26 Aug 2024 11:33:05 -0700 Subject: [PATCH 08/18] update llm eval code Signed-off-by: minmin-intel --- evals/evaluation/crag_eval/docker/Dockerfile | 3 +- .../crag_eval/docker/requirements.txt | 1 + .../crag_eval/run_benchmark/grade_answers.py | 32 ++++++++++++++++--- evals/metrics/ragas/ragas.py | 11 +++++-- 4 files changed, 39 insertions(+), 8 deletions(-) diff --git a/evals/evaluation/crag_eval/docker/Dockerfile b/evals/evaluation/crag_eval/docker/Dockerfile index 0421000a..a3a97c5b 100644 --- a/evals/evaluation/crag_eval/docker/Dockerfile +++ b/evals/evaluation/crag_eval/docker/Dockerfile @@ -9,7 +9,8 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ wget \ git \ poppler-utils \ - libmkl-dev + libmkl-dev \ + curl COPY requirements.txt /home/user/requirements.txt diff --git a/evals/evaluation/crag_eval/docker/requirements.txt b/evals/evaluation/crag_eval/docker/requirements.txt index 662f3146..4579eca7 100644 --- a/evals/evaluation/crag_eval/docker/requirements.txt +++ b/evals/evaluation/crag_eval/docker/requirements.txt @@ -2,6 +2,7 @@ pandas datasets ragas langchain-community +langchain-huggingface evaluate jieba sentence_transformers \ No newline at end of file diff --git a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py index 21c16db3..a3d08322 100644 --- a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py +++ b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py @@ -17,6 +17,22 @@ def convert_data_format_for_ragas(data): return output +def make_list_of_test_cases(data): + # data: pandas dataframe + # columns: ['query', 'answer', 'ref_answer'] + # return: a dict with keys: 'input', 'actual_output', 'expected_output' + output = [] + for _, row in data.iterrows(): + output.append( + { + 'input': [row['query']], + 'actual_output': [row['answer']], + 'expected_output': [row['ref_answer']], + 'retrieval_context': [["dummy_context"]] + } + ) + return output + def grade_answers(args, test_case): from langchain_community.embeddings import HuggingFaceBgeEmbeddings print('==============getting 
embeddings==============') @@ -27,8 +43,14 @@ def grade_answers(args, test_case): model= args.llm_endpoint, embeddings=embeddings) print('==============start grading==============') - metric.measure(test_case) - print(metric.score) + scores = [] + for case in test_case: + metric.measure(case) + scores.append(metric.score) + print(metric.score) + print('-'*50) + # metric.measure(test_case) + return scores if __name__ == '__main__': parser = argparse.ArgumentParser() @@ -41,7 +63,9 @@ def grade_answers(args, test_case): data = pd.read_csv(os.path.join(args.filedir, args.filename)) data = data.head(2) print(data) - test_case = convert_data_format_for_ragas(data) + # test_case = convert_data_format_for_ragas(data) + test_case = make_list_of_test_cases(data) print(test_case) - grade_answers(args, test_case) + scores = grade_answers(args, test_case) + print(scores) diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index 945b8671..ac717d92 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -7,7 +7,7 @@ import os from typing import Dict, Optional, Union -from langchain_community.llms import HuggingFaceEndpoint +from langchain_huggingface import HuggingFaceEndpoint from langchain_core.embeddings import Embeddings from langchain_core.language_models import BaseLanguageModel @@ -82,10 +82,15 @@ def measure(self, test_case: Dict): print("OPENAI_API_KEY is provided, ragas initializes the model by OpenAI.") self.model = None if isinstance(self.model, str): + print('LLM endpoint: ', self.model) chat_model = HuggingFaceEndpoint( endpoint_url=self.model, - timeout=600, + task="text-generation", + max_new_tokens=1024, + do_sample=False, ) + print('Validating LLM endpoint....') + chat_model.invoke("Hello!") else: chat_model = self.model # Create a dataset from the test case @@ -128,7 +133,7 @@ def measure(self, test_case: Dict): llm=chat_model, embeddings=self.embeddings, ) - print(self.score) + # print(self.score) return self.score def is_successful(self): From 7b9b9b204686e68ab2cb09532ec85bf7560b46c6 Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Tue, 27 Aug 2024 10:46:32 -0700 Subject: [PATCH 09/18] allow per sample grading Signed-off-by: minmin-intel --- .../crag_eval/run_benchmark/grade_answers.py | 30 +++++++----- .../docker-compose-llm-judge-gaudi.yaml | 2 +- .../llm_judge/launch_llm_judge_endpoint.sh | 2 +- .../llm_judge/test_llm_endpoint.py | 15 ++++++ evals/metrics/ragas/ragas.py | 46 ++++++++----------- 5 files changed, 56 insertions(+), 39 deletions(-) create mode 100644 evals/evaluation/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py diff --git a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py index a3d08322..1741354c 100644 --- a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py +++ b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py @@ -20,7 +20,7 @@ def convert_data_format_for_ragas(data): def make_list_of_test_cases(data): # data: pandas dataframe # columns: ['query', 'answer', 'ref_answer'] - # return: a dict with keys: 'input', 'actual_output', 'expected_output' + # return: a list of dicts with keys: 'input', 'actual_output', 'expected_output' output = [] for _, row in data.iterrows(): output.append( @@ -43,14 +43,18 @@ def grade_answers(args, test_case): model= args.llm_endpoint, embeddings=embeddings) print('==============start grading==============') - scores = [] - for case in test_case: - metric.measure(case) - scores.append(metric.score) 
- print(metric.score) - print('-'*50) - # metric.measure(test_case) - return scores + + if args.batch_grade: + metric.measure(test_case) + return metric.score + else: + scores = [] + for case in test_case: + metric.measure(case) + scores.append(metric.score) + print(metric.score) + print('-'*50) + return scores if __name__ == '__main__': parser = argparse.ArgumentParser() @@ -58,13 +62,17 @@ def grade_answers(args, test_case): parser.add_argument("--llm_endpoint", type=str, default="http://localhost:8008") parser.add_argument("--filedir", type=str, help="Path to the file containing the data") parser.add_argument("--filename", type=str, help="Name of the file containing the data") + parser.add_argument("--batch_grade", action="store_true", help="Grade the answers in batch and get an aggregated score for the entire dataset") args = parser.parse_args() data = pd.read_csv(os.path.join(args.filedir, args.filename)) data = data.head(2) print(data) - # test_case = convert_data_format_for_ragas(data) - test_case = make_list_of_test_cases(data) + if args.batch_grade: + test_case = convert_data_format_for_ragas(data) + else: + test_case = make_list_of_test_cases(data) + print(test_case) scores = grade_answers(args, test_case) print(scores) diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml index 3bed748f..56b1e6dd 100644 --- a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml @@ -20,4 +20,4 @@ services: cap_add: - SYS_NICE ipc: host - command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 --sharded true --num-shard 4 \ No newline at end of file + command: --model-id ${LLM_MODEL_ID} --max-input-length 4096 --max-total-tokens 8192 --sharded true --num-shard 4 \ No newline at end of file diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh b/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh index 56ce8ab9..5b13d01d 100644 --- a/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh @@ -1,4 +1,4 @@ -export LLM_MODEL_ID="meta-llama/Meta-Llama-3-70B-Instruct" #"meta-llama/Meta-Llama-3-70B-Instruct" # +export LLM_MODEL_ID="meta-llama/Meta-Llama-3-70B-Instruct" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export HF_CACHE_DIR=${HF_CACHE_DIR} docker compose -f docker-compose-llm-judge-gaudi.yaml up -d \ No newline at end of file diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py b/evals/evaluation/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py new file mode 100644 index 00000000..ce961173 --- /dev/null +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py @@ -0,0 +1,15 @@ +from langchain_huggingface import HuggingFaceEndpoint +import os + +host_ip=os.environ.get("host_ip", "localhost") +url = "http://{host_ip}:8085".format(host_ip=host_ip) +print(url) + +model = HuggingFaceEndpoint( + endpoint_url=url, + task="text-generation", + max_new_tokens=10, + do_sample=False, +) + +print(model.invoke("what is deep learing?")) diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index ac717d92..d8e218dc 100644 --- a/evals/metrics/ragas/ragas.py +++ 
b/evals/metrics/ragas/ragas.py @@ -54,28 +54,11 @@ def __init__( self.metrics = metrics self.validated_list = ["answer_relevancy", "faithfulness", "answer_correctness"] - async def a_measure(self, test_case: Dict): - return self.measure(test_case) - - def measure(self, test_case: Dict): - - # sends to server try: - from ragas import evaluate from ragas.metrics import answer_relevancy, faithfulness, answer_correctness - except ModuleNotFoundError: raise ModuleNotFoundError("Please install ragas to use this metric. `pip install ragas`.") - try: - from datasets import Dataset - except ModuleNotFoundError: - raise ModuleNotFoundError("Please install dataset") - self.metrics_instance = { - "answer_relevancy": answer_relevancy, - "faithfulness": faithfulness, - } - # Set LLM model openai_key = os.getenv("OPENAI_API_KEY", None) if openai_key is not None: @@ -83,18 +66,16 @@ def measure(self, test_case: Dict): self.model = None if isinstance(self.model, str): print('LLM endpoint: ', self.model) - chat_model = HuggingFaceEndpoint( + self.chat_model = HuggingFaceEndpoint( endpoint_url=self.model, task="text-generation", max_new_tokens=1024, do_sample=False, ) - print('Validating LLM endpoint....') - chat_model.invoke("Hello!") else: - chat_model = self.model - # Create a dataset from the test case - # Convert the Dict to a format compatible with Dataset + self.chat_model = self.model + + # initialize metrics if self.metrics is not None: tmp_metrics = [] # check supported list @@ -113,12 +94,26 @@ def measure(self, test_case: Dict): raise ValueError("answer_relevancy metric need provide embeddings model.") tmp_metrics.append(get_metric(metric)) self.metrics = tmp_metrics - else: + else: # default metrics self.metrics = [ answer_relevancy, faithfulness, + answer_correctness, ] + + async def a_measure(self, test_case: Dict): + return self.measure(test_case) + + def measure(self, test_case: Dict): + from ragas import evaluate + try: + from datasets import Dataset + except ModuleNotFoundError: + raise ModuleNotFoundError("Please install dataset") + + # Create a dataset from the test case + # Convert the Dict to a format compatible with Dataset data = { "question": test_case["input"], "contexts": test_case["retrieval_context"], @@ -130,10 +125,9 @@ def measure(self, test_case: Dict): self.score = evaluate( dataset, metrics=self.metrics, - llm=chat_model, + llm=self.chat_model, embeddings=self.embeddings, ) - # print(self.score) return self.score def is_successful(self): From 5c58b729065a9f60e810c7bf80d218c4b84af1bd Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Tue, 27 Aug 2024 10:56:18 -0700 Subject: [PATCH 10/18] save graded scores Signed-off-by: minmin-intel --- .../crag_eval/run_benchmark/grade_answers.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py index 1741354c..3aaaba5c 100644 --- a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py +++ b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py @@ -46,12 +46,12 @@ def grade_answers(args, test_case): if args.batch_grade: metric.measure(test_case) - return metric.score + return metric.score['answer_correctness'] else: scores = [] for case in test_case: metric.measure(case) - scores.append(metric.score) + scores.append(metric.score['answer_correctness']) print(metric.score) print('-'*50) return scores @@ -66,7 +66,7 @@ def grade_answers(args, test_case): args = parser.parse_args() 
data = pd.read_csv(os.path.join(args.filedir, args.filename)) - data = data.head(2) + # data = data.head(2) print(data) if args.batch_grade: test_case = convert_data_format_for_ragas(data) @@ -74,6 +74,17 @@ def grade_answers(args, test_case): test_case = make_list_of_test_cases(data) print(test_case) + scores = grade_answers(args, test_case) - print(scores) + + # save the scores + if args.batch_grade: + print("Aggregated answer correctness score: ", scores) + else: + data['answer_correctness'] = scores + output_file = args.filename.split('.')[0] + '_graded.csv' + data.to_csv(os.path.join(args.filedir, output_file), index=False) + print("Scores saved to ", os.path.join(args.filedir, args.output)) + + From d043dded61f9b89d59d96bb774928e5917be711c Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Tue, 27 Aug 2024 11:30:27 -0700 Subject: [PATCH 11/18] update readme Signed-off-by: minmin-intel --- evals/evaluation/crag_eval/README.md | 24 ++++++++-------- .../preprocess_data/run_data_preprocess.sh | 2 +- .../preprocess_data/run_sample_data.sh | 2 +- .../crag_eval/run_benchmark/grade_answers.py | 8 +++--- .../run_benchmark/run_generate_answer.sh | 8 +++--- .../crag_eval/run_benchmark/run_grading.sh | 2 +- .../run_benchmark/sample_answers.csv | 28 ------------------- 7 files changed, 24 insertions(+), 50 deletions(-) delete mode 100644 evals/evaluation/crag_eval/run_benchmark/sample_answers.csv diff --git a/evals/evaluation/crag_eval/README.md b/evals/evaluation/crag_eval/README.md index aa7097be..a277c136 100644 --- a/evals/evaluation/crag_eval/README.md +++ b/evals/evaluation/crag_eval/README.md @@ -50,10 +50,7 @@ The CRAG dataset has more than 4000 queries, and running all of them can be very ``` bash run_sample_data.sh ``` -4. Use the small subset that we have processed for a quick run -``` -Small data files in this repo -``` + ## Launch agent QnA system Here we showcase a RAG agent in GenAIExample repo. Please refer to the README in the AgentQnA example for more details. You can build your own agent systems using OPEA components, then expose your own systems as an endpoint for this benchmark. 1. Build images ``` git clone cd GenAIExamples/AgentQnA/tests/ bash 1_build_images.sh ``` 2. Start retrieval tool ``` bash 2_start_retrieval_tool.sh ``` 3. Ingest data into vector database and validate retrieval tool ``` # Placeholder - may change depending on data bash 3_ingest_data_and_validate_retrieval.sh ``` 3. Launch and validate agent endpoint ``` bash 4_launch_and_validate_agent.sh ``` ## Run CRAG benchmark -Once you have your agent system up and running, you can follow the steps below to run the benchmark. -1. Generate answers with agent -Change the variables in the script below and run the script. By default, it will run a sampled set of queries in music domain. +Once you have your agent system up and running, the next step is to generate answers with the agent. Change the variables in the script below and run the script. By default, it will run a sampled set of queries in the music domain. ``` cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark bash run_generate_answer.sh ``` -2. Use LLM-as-judge to grade the answers -First, in another terminal, launch llm endpoint with HF TGI + +## Use LLM-as-judge to grade the answers +1. Launch the LLM judge endpoint with HF TGI: in another terminal, run the command below. By default, `meta-llama/Meta-Llama-3-70B-Instruct` is used as the LLM judge. ``` cd llm_judge bash launch_llm_judge_endpoint.sh ``` -Validate that the llm endpoint is working properly. +2. Validate that the LLM endpoint is working properly. 
``` export host_ip=$(hostname -I | awk '{print $1}') curl ${host_ip}:8085/generate_stream \ -X POST \ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \ -H 'Content-Type: application/json' ``` -Second, back to the interactive crag-eval docker, run command below +Then go back to the interactive crag-eval docker container and run the command below. +``` +cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark/llm_judge/ +python3 test_llm_endpoint.py +``` +3. Grade the answer correctness using the LLM judge. We use the `answer_correctness` metric from [ragas](https://github.com/explodinggradients/ragas/blob/main/src/ragas/metrics/_answer_correctness.py). ``` +cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark/ bash run_grading.sh ``` diff --git a/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh b/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh index 8ecfa94f..fc6f19be 100644 --- a/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh +++ b/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh @@ -2,4 +2,4 @@ FILEDIR=$WORKDIR/datasets/crag_task_3_dev_v4 DOCOUT=$WORKDIR/datasets/crag_docs QAOUT=$WORKDIR/datasets/crag_qas -python process_data.py --data_dir $FILEDIR --docout $DOCOUT --qaout $QAOUT \ No newline at end of file +python3 process_data.py --data_dir $FILEDIR --docout $DOCOUT --qaout $QAOUT \ No newline at end of file diff --git a/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh b/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh index f0bce763..06514702 100644 --- a/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh +++ b/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh @@ -1,3 +1,3 @@ FILEDIR=$WORKDIR/datasets/crag_docs -python sample_data.py --filedir $FILEDIR \ No newline at end of file +python3 sample_data.py --filedir $FILEDIR \ No newline at end of file diff --git a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py index 3aaaba5c..ddeecdb6 100644 --- a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py +++ b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py @@ -66,14 +66,13 @@ def grade_answers(args, test_case): args = parser.parse_args() data = pd.read_csv(os.path.join(args.filedir, args.filename)) - # data = data.head(2) - print(data) + if args.batch_grade: test_case = convert_data_format_for_ragas(data) else: test_case = make_list_of_test_cases(data) - print(test_case) + # print(test_case) scores = grade_answers(args, test_case) @@ -82,9 +81,10 @@ def grade_answers(args, test_case): print("Aggregated answer correctness score: ", scores) else: data['answer_correctness'] = scores + print("Average answer correctness score: ", data['answer_correctness'].mean()) output_file = args.filename.split('.')[0] + '_graded.csv' data.to_csv(os.path.join(args.filedir, output_file), index=False) - print("Scores saved to ", os.path.join(args.filedir, args.output)) + print("Scores saved to ", os.path.join(args.filedir, output_file)) diff --git a/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh b/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh index 24b477ee..d4529fdb 100644 --- a/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh +++ b/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh @@ -1,13 +1,13 @@ -host_ip=$(hostname -I | awk '{print $1}') # change this to the IP of 
the service -port=9095 # change this to the port of the service -endpoint=${port}/v1/chat/completions # change this to the endpoint of the service +host_ip=$(hostname -I | awk '{print $1}') # change this to the IP of the agent +port=9095 # change this to the port of the agent +endpoint=${port}/v1/chat/completions # change this to the endpoint of the agent URL="http://${host_ip}:${endpoint}" echo "AGENT ENDPOINT URL: ${URL}" QUERYFILE=$WORKDIR/datasets/crag_qas/crag_qa_music_sampled.jsonl OUTPUTFILE=$WORKDIR/datasets/crag_results/results.jsonl -python generate_answers.py \ +python3 generate_answers.py \ --endpoint_url ${URL} \ --query_file $QUERYFILE \ --output_file $OUTPUTFILE diff --git a/evals/evaluation/crag_eval/run_benchmark/run_grading.sh b/evals/evaluation/crag_eval/run_benchmark/run_grading.sh index ff5a22f7..82cae435 100644 --- a/evals/evaluation/crag_eval/run_benchmark/run_grading.sh +++ b/evals/evaluation/crag_eval/run_benchmark/run_grading.sh @@ -1,5 +1,5 @@ FILEDIR=$WORKDIR/datasets/crag_results/ -FILENAME=crag_20queries_react_docgradertool_top5apis_v2sysm_gpt4omini.csv +FILENAME=results.csv LLM_ENDPOINT=http://${host_ip}:8085 python3 grade_answers.py \ diff --git a/evals/evaluation/crag_eval/run_benchmark/sample_answers.csv b/evals/evaluation/crag_eval/run_benchmark/sample_answers.csv deleted file mode 100644 index 790f6f6d..00000000 --- a/evals/evaluation/crag_eval/run_benchmark/sample_answers.csv +++ /dev/null @@ -1,28 +0,0 @@ -query,query_time,ref_answer,answer,context,num_llm_calls,total_tokens,question_type,static_or_dynamic,selected_tools -how many reading and leeds festivals has the band foo fighters headlined?,"03/13/2024, 09:55:12 PT",foo fighters have headlined a total of 4 reading and leeds festivals.,"The Foo Fighters have headlined the Reading and Leeds Festivals four times: in 2002, 2005, 2012, and 2019.","[[{'id': 'call_05BuB1KtO8uuJypQh6lecBVM', 'function': {'arguments': '{""query"":""Foo Fighters Reading and Leeds Festival headlining history""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'The Foo Fighters have a notable history of headlining the Reading and Leeds Festivals. They first headlined the festival in 2002 and returned to headline again in 2005. After a significant gap, they headlined once more in 2012. In 2019, they were announced as one of the headliners again, marking their return to the festival after several years. Overall, they have been a prominent presence at the festival since their first appearance in 1995.', 'The Foo Fighters have headlined the Reading and Leeds Festivals four times: in 2002, 2005, 2012, and 2019.']",3,959,aggregation,slow-changing,"['get_members', 'get_song_release_date', 'get_song_author', 'get_grammy_award_count_by_song', 'get_grammy_award_count_by_artist']" -how many songs has the band the beatles released that have been recorded at abbey road studios?,"03/13/2024, 09:54:32 PT",the beatles have released 190 songs that have been recorded at abbey road studios.,"The Beatles recorded most of their music at Abbey Road Studios, but the exact number of songs they released that were recorded there is not specified in the available information. 
However, it is known that they recorded a significant portion of their catalog at this studio from 1957 to 1970.","[[{'id': 'call_wthw0YH3cxU3yzYMdhfyY2OI', 'function': {'arguments': '{""query"":""The Beatles songs recorded at Abbey Road Studios""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'The Beatles recorded most of their music at Abbey Road Studios, formerly known as EMI Recording Studios, from their inception in 1957 until their break-up in 1970. This includes iconic albums such as ""Abbey Road,"" ""Sgt. Pepper\'s Lonely Hearts Club Band,"" and ""The White Album."" The studio is renowned for its cultural and historical significance in the music industry, particularly for the Beatles\' extensive catalog.', 'The Beatles recorded most of their music at Abbey Road Studios, but the exact number of songs they released that were recorded there is not specified in the available information. However, it is known that they recorded a significant portion of their catalog at this studio from 1957 to 1970.']",3,968,aggregation,static,"['get_grammy_award_count_by_song', 'get_song_release_date', 'get_song_release_country', 'get_members', 'search_song_entity_by_name']" -did the song the globe come out before the wedding?,"03/21/2024, 23:32:29 PT",yes,"The song ""The Globe"" was released in 1991, while ""The Wedding"" was released on September 23, 2016. Therefore, ""The Globe"" came out before ""The Wedding.""","[[{'id': 'call_7wKEamOXFKVmki6gklVACm0M', 'function': {'arguments': '{""song_name"": ""The Globe""}', 'name': 'get_song_release_date'}, 'type': 'function'}, {'id': 'call_rfOh7EoCSkyDRGPe7DvgKMFE', 'function': {'arguments': '{""song_name"": ""The Wedding""}', 'name': 'get_song_release_date'}, 'type': 'function'}], '{""result"": ""1991""}', '{""result"": ""2016-09-23""}', 'The song ""The Globe"" was released in 1991, while ""The Wedding"" was released on September 23, 2016. Therefore, ""The Globe"" came out before ""The Wedding.""']",2,839,comparison,static,"['get_song_release_country', 'search_song_entity_by_name', 'get_song_release_date', 'get_grammy_best_song_by_year', 'get_song_author']" -"who has had more number one hits on the us billboard hot 100 chart, michael jackson or elvis presley?","03/13/2024, 09:43:49 PT","elvis presley has had more number one hits on the us billboard hot 100 chart than michael jackson, with 18 number one hits compared to jackson's 13.","Elvis Presley had more number one hits on the US Billboard Hot 100 chart, with a total of 18 hits, compared to Michael Jackson's 13 number one hits.","[[{'id': 'call_i5H8SAsGgRTXnvI0sDm5wmrv', 'function': {'arguments': '{""query"": ""Michael Jackson number one hits on US Billboard Hot 100""}', 'name': 'search_knowledge_base'}, 'type': 'function'}, {'id': 'call_Mgec4kbeE0z4h3QhJ0ENNTHU', 'function': {'arguments': '{""query"": ""Elvis Presley number one hits on US Billboard Hot 100""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Michael Jackson had 13 number-one hits on the US Billboard Hot 100.', 'Elvis Presley had 18 number one hits on the US Billboard Hot 100 chart. 
His number one singles spent a total of 81 weeks at the top position, which is the most of any artist.', ""Elvis Presley had more number one hits on the US Billboard Hot 100 chart, with a total of 18 hits, compared to Michael Jackson's 13 number one hits.""]",4,1136,comparison,static,"['get_grammy_award_count_by_song', 'get_grammy_award_count_by_artist', 'get_song_author', 'search_song_entity_by_name', 'get_song_release_country']" -when was alex van halen the drummer for the band santana?,"03/13/2024, 10:04:31 PT",invalid question,Invalid question.,['Invalid question.'],1,343,false_premise,static,"['get_members', 'get_song_author', 'get_song_release_date', 'get_grammy_award_date_by_artist', 'get_grammy_award_count_by_song']" -"when did dolly parton's song, blown away, come out?","03/10/2024, 23:28:06 PT",invalid question,"Carrie Underwood's song ""Blown Away"" was released on May 1, 2012.","[[{'id': 'call_7ZIr0njVYoqlfvoCdd6pLmHb', 'function': {'arguments': '{""song_name"":""Blown Away""}', 'name': 'get_song_release_country'}, 'type': 'function'}], '{""result"": ""ID""}', [{'id': 'call_ZUYqfFFb4VRGVHwhrG3nsV8L', 'function': {'arguments': '{""song_name"":""Blown Away""}', 'name': 'get_song_author'}, 'type': 'function'}], '{""result"": ""Carrie Underwood""}', [{'id': 'call_mA2bAMtysJClId77OoUan9yO', 'function': {'arguments': '{""query"":""Blown Away Carrie Underwood release date""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Carrie Underwood\'s album ""Blown Away"" was released on May 1, 2012.', 'Carrie Underwood\'s song ""Blown Away"" was released on May 1, 2012.']",5,1798,false_premise,static,"['get_grammy_award_count_by_song', 'get_song_author', 'get_song_release_country', 'get_grammy_best_song_by_year', 'get_grammy_award_date_by_artist']" -what grammy award did edgar barrera win this year?,"03/05/2024, 23:25:22 PT",invalid question,"Edgar Barrera won a Grammy award in 2014, but there is no record of him winning a Grammy award in 2024.","[[{'id': 'call_ptgqp9zavnUvzSqmBNPNa8R3', 'function': {'arguments': '{""artist_name"":""Edgar Barrera""}', 'name': 'get_grammy_award_date_by_artist'}, 'type': 'function'}], '{""result"": [2014]}', 'Edgar Barrera won a Grammy award in 2014, but there is no record of him winning a Grammy award in 2024.']",2,828,false_premise,static,"['get_grammy_award_date_by_artist', 'get_grammy_best_artist_by_year', 'get_grammy_best_album_by_year', 'get_grammy_award_count_by_artist', 'get_grammy_best_song_by_year']" -when did miley cyrus win grammy best new artist award?,"03/21/2024, 23:38:14 PT",invalid question,Miley Cyrus has not won the Grammy Best New Artist award.,"[[{'id': 'call_wCWNt9LOduR5IESTc1jeB6n4', 'function': {'arguments': '{""artist_name"":""Miley Cyrus""}', 'name': 'get_grammy_award_date_by_artist'}, 'type': 'function'}], '{""result"": []}', 'Miley Cyrus has not won the Grammy Best New Artist award.']",2,790,false_premise,static,"['get_all_awarded_artists', 'get_grammy_best_artist_by_year', 'get_grammy_award_date_by_artist', 'get_grammy_award_count_by_artist', 'get_grammy_award_count_by_song']" -when was stevie wonder the keyboardist for the band the rolling stones?,"03/13/2024, 10:01:54 PT",invalid question,Invalid question.,['Invalid question.'],1,355,false_premise,static,"['get_grammy_award_date_by_artist', 'get_artist_birth_date', 'get_grammy_award_count_by_artist', 'get_grammy_award_count_by_song', 'get_artist_birth_place']" -who is nicki minaj's latest diss track about?,"03/10/2024, 23:32:04 PT","nicki minaj's song ""big foot"" is a 
diss track about megan thee stallion.","Nicki Minaj's latest track, titled ""Big Foot,"" released in early 2024, is widely speculated to be a response to Megan Thee Stallion's song ""Hiss."" While many consider it a diss track, Nicki has denied that it is intended as such, stating, ""Who said 'diss track'??????"" The lyrics seem to take shots at Megan, but Nicki has pushed back against the diss track label.","[[{'id': 'call_trQnbJbaT3p5iPBASlNNFK0s', 'function': {'arguments': '{""query"":""Nicki Minaj latest diss track 2024""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Nicki Minaj\'s latest track, titled ""Big Foot,"" has been released in early 2024 and is widely speculated to be a response to Megan Thee Stallion\'s song ""Hiss."" While many are labeling it a diss track, Nicki herself has denied that it is intended as such, stating, ""Who said \'diss track\'??????"" The song features lyrics that appear to take shots at Megan, but Nicki has pushed back against the diss track label.', 'Nicki Minaj\'s latest track, titled ""Big Foot,"" released in early 2024, is widely speculated to be a response to Megan Thee Stallion\'s song ""Hiss."" While many consider it a diss track, Nicki has denied that it is intended as such, stating, ""Who said \'diss track\'??????"" The lyrics seem to take shots at Megan, but Nicki has pushed back against the diss track label.']",3,1001,multi-hop,fast-changing,"['get_song_author', 'get_all_awarded_artists', 'get_song_release_date', 'get_grammy_best_artist_by_year', 'search_song_entity_by_name']" -what's the most recent album from the founder of ysl records?,"03/05/2024, 23:28:07 PT",business is business,"Sorry, need more steps to process this request.","[[{'id': 'call_2yMQLyoFwYCt2KTxpiQD9I4k', 'function': {'arguments': '{""artist_name"":""Young Stoner Life Records""}', 'name': 'search_artist_entity_by_name'}, 'type': 'function'}], '{""result"": [""Young Stoner Life"", ""Young Life"", ""Stoner"", ""Re*Cords"", ""Alyson Stoner"", ""Art Stoner"", ""Billy Stoner"", ""Brian Stoner"", ""Colin Stoner"", ""David Stoner""]}', [{'id': 'call_dBz3tCbgZz7C9jSKczWKUP5T', 'function': {'arguments': '{""artist_name"":""Young Thug""}', 'name': 'search_artist_entity_by_name'}, 'type': 'function'}], '{""result"": [""Young Thug"", ""Young T.H.U.G."", ""Thug"", ""T.H.U.G"", ""T.H.U.G."", ""Arden Thug"", ""Babs Thug"", ""Bambie Thug"", ""blond thug"", ""Cali Thug""]}', [{'id': 'call_GbuvwSrKem9ZbExvw6gf1U8I', 'function': {'arguments': '{""artist_name"":""Young Thug""}', 'name': 'get_artist_all_works'}, 'type': 'function'}], '{""result"": [""All in a Day"", ""1017 Thug"", ""Black Portland"", ""Thuga Thuga"", ""1017 Thug 2"", ""1017 Thug 3: The Finale"", ""I Came From Nothing 3"", ""I Came From Nothing 2"", ""Barter 6"", ""Slime Season 2"", ""Slime Season 3"", ""I Came From Nothing"", ""Gangster Shit"", ""JEFFERY"", ""Old English"", ""Gang Up"", ""Beautiful Thugger Girls"", ""Young Martha"", ""Danny Glover"", ""Slime Language"", ""On the Rvn!!"", ""The London"", ""Thugga Thugga"", ""Metro Thuggin"", ""Hot (remix)"", ""Slime Season 4"", ""Cheat Code Mode"", ""Trap Zuse"", ""2 Bitches"", ""Young Rich Homie Thuggin"", ""Tell Her Nothing"", ""Stoner (remixes)"", ""Liger"", ""Brick Sqaud"", ""I Came from Nothing 2"", ""A Nightmare From Beyond"", ""Street Is Watching (The Mix-Tape)"", ""BUSINESS IS BUSINESS"", ""From a Man"", ""From A Man""]}', [{'id': 'call_NQ22bfeZLvSY61NxL2YP1YBb', 'function': {'arguments': '{""artist_name"":""Young Thug""}', 'name': 'get_artist_all_works'}, 
'type': 'function'}], '{""result"": [""All in a Day"", ""1017 Thug"", ""Black Portland"", ""Thuga Thuga"", ""1017 Thug 2"", ""1017 Thug 3: The Finale"", ""I Came From Nothing 3"", ""I Came From Nothing 2"", ""Barter 6"", ""Slime Season 2"", ""Slime Season 3"", ""I Came From Nothing"", ""Gangster Shit"", ""JEFFERY"", ""Old English"", ""Gang Up"", ""Beautiful Thugger Girls"", ""Young Martha"", ""Danny Glover"", ""Slime Language"", ""On the Rvn!!"", ""The London"", ""Thugga Thugga"", ""Metro Thuggin"", ""Hot (remix)"", ""Slime Season 4"", ""Cheat Code Mode"", ""Trap Zuse"", ""2 Bitches"", ""Young Rich Homie Thuggin"", ""Tell Her Nothing"", ""Stoner (remixes)"", ""Liger"", ""Brick Sqaud"", ""I Came from Nothing 2"", ""A Nightmare From Beyond"", ""Street Is Watching (The Mix-Tape)"", ""BUSINESS IS BUSINESS"", ""From a Man"", ""From A Man""]}', 'Sorry, need more steps to process this request.']",5,500,multi-hop,slow-changing,"['get_song_author', 'search_artist_entity_by_name', 'get_grammy_award_count_by_artist', 'get_all_awarded_artists', 'get_artist_all_works']" -what's the most recent album from the puerto rican artist that's been in wwe?,"03/10/2024, 23:31:52 PT",nadie sabe lo que va a pasar mañana,"The most recent album from the Puerto Rican artist Bad Bunny, who has been in WWE, is ""Nadie sabe lo que va a pasar mañana,"" released on October 13, 2023.","[[{'id': 'call_b8OrYeD5gSQjBOnUjJvtg6Y8', 'function': {'arguments': '{""artist_name"":""Bad Bunny""}', 'name': 'search_artist_entity_by_name'}, 'type': 'function'}], '{""result"": [""Bad Bunny"", ""Bunny"", ""All Bad Bunny Everything"", ""bAd bAd"", ""The Bad Bad Bad"", ""Bad"", ""B.A.D."", ""BAD!"", ""BAD"", ""Bad Bad Bird""]}', [{'id': 'call_jKkRPtMcqc8Pp9127rxo3jV9', 'function': {'arguments': '{""artist_name"":""Bad Bunny""}', 'name': 'get_grammy_award_count_by_artist'}, 'type': 'function'}], '{""result"": 0}', [{'id': 'call_7PySlZykLTc7aHyLSi5EdMmJ', 'function': {'arguments': '{""artist_name"":""Bad Bunny""}', 'name': 'get_grammy_award_date_by_artist'}, 'type': 'function'}], '{""result"": []}', [{'id': 'call_21vZE6bAa9CfdWONNgwEYBAU', 'function': {'arguments': '{""query"":""Bad Bunny discography""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Bad Bunny\'s discography includes several studio albums, with his latest being ""Nadie sabe lo que va a pasar mañana,"" released on October 13, 2023. This album features 22 tracks and includes collaborations with artists such as Young Miko, Feid, Arcángel, Bryant Myers, and Eladio Carrión. His previous albums include ""Un verano sin ti"" and others that have contributed to his recognition as a leading artist in the Latin music scene.', 'The most recent album from the Puerto Rican artist Bad Bunny, who has been in WWE, is ""Nadie sabe lo que va a pasar mañana,"" released on October 13, 2023.']",6,2614,multi-hop,slow-changing,"['get_all_awarded_artists', 'get_grammy_award_count_by_artist', 'get_grammy_award_date_by_artist', 'search_artist_entity_by_name', 'get_grammy_best_artist_by_year']" -who has played drums for the red hot chili peppers?,"03/17/2024, 17:08:10 PT","chad smith, jack irons, and cliff martinez","The drummers who have played for the Red Hot Chili Peppers include: - -1. Chad Smith -2. Jack Irons -3. Cliff Martinez -4. D.H. 
Peligro - -These are some of the notable drummers associated with the band.","[[{'id': 'call_O5mU9etmvrZ2rpqe38tvglh8', 'function': {'arguments': '{""band_name"":""Red Hot Chili Peppers""}', 'name': 'get_members'}, 'type': 'function'}], '{""result"": [""John Frusciante"", ""Flea"", ""D.H. Peligro"", ""DeWayne \\""Blackbyrd\\"" McKnight"", ""Arik Marshall"", ""Josh Klinghoffer"", ""Chad Smith"", ""Dave Navarro"", ""Hillel Slovak"", ""Jack Irons"", ""Jack Sherman"", ""Cliff Martinez"", ""Anthony Kiedis"", ""Jesse Tobias""]}', 'The drummers who have played for the Red Hot Chili Peppers include:\n\n1. Chad Smith\n2. Jack Irons\n3. Cliff Martinez\n4. D.H. Peligro\n\nThese are some of the notable drummers associated with the band.']",2,858,set,static,"['get_members', 'get_song_author', 'get_artist_birth_place', 'get_grammy_award_count_by_artist', 'get_grammy_award_count_by_song']" -how many tracks are in drake's last album?,"03/12/2024, 12:28:41 PT",23,"Drake's last album, ""For All the Dogs,"" has a total of 23 tracks.","[[{'id': 'call_nXv914FUcVGCYeyW2MdmigSz', 'function': {'arguments': '{""query"":""Drake last album track count""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Drake\'s last album, ""For All the Dogs,"" has a track count of 23.', 'Drake\'s last album, ""For All the Dogs,"" has a total of 23 tracks.']",3,852,simple,slow-changing,"['get_grammy_award_count_by_song', 'get_grammy_award_count_by_artist', 'get_all_awarded_artists', 'get_song_release_date', 'get_song_release_country']" -how many number one hits did taylor swift have?,"03/12/2024, 12:28:35 PT",11,"As of February 24, 2024, Taylor Swift has 11 number one hits on the Billboard Hot 100.","[[{'id': 'call_AeMTBYLShMlu3PzfQztMCxns', 'function': {'arguments': '{""query"":""Taylor Swift number one hits""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'As of the Hot 100 dated February 24, 2024, Taylor Swift has 11 number one hits on the Billboard Hot 100.', 'As of February 24, 2024, Taylor Swift has 11 number one hits on the Billboard Hot 100.']",3,877,simple,slow-changing,"['get_grammy_award_count_by_song', 'get_grammy_award_count_by_artist', 'get_song_author', 'get_grammy_award_date_by_artist', 'search_song_entity_by_name']" -where is marem ladson's original home country?,"03/21/2024, 23:41:53 PT",es,Marem Ladson's original home country is Spain (ES).,"[[{'id': 'call_Zey99OPDixDXc3nUHZnD5MVC', 'function': {'arguments': '{""artist_name"":""Marem Ladson""}', 'name': 'get_artist_birth_place'}, 'type': 'function'}], '{""result"": ""ES""}', ""Marem Ladson's original home country is Spain (ES).""]",2,716,simple,static,"['get_artist_birth_place', 'get_song_release_country', 'get_song_author', 'get_artist_birth_date', 'search_song_entity_by_name']" -tell me the most recent song or album by doris duke?,"03/21/2024, 23:40:20 PT",i’m a loser: the swamp dogg sessions... 
and more,"The most recent album by Doris Duke is ""I'm A Loser,"" which was released on February 4, 2021.","[[{'id': 'call_IBn6alF1JcL2QNYxDvkd9Nla', 'function': {'arguments': '{""song_name"":""Doris Duke""}', 'name': 'search_song_entity_by_name'}, 'type': 'function'}], '{""result"": [""Passion Flower (For Doris Duke)"", ""Doris"", ""Black Doris"", ""Clara / Doris"", ""Doris 100"", ""Doris Day"", ""Doris Machin"", ""Doris & Me"", ""Doris Rising"", ""Doris Troy""]}', [{'id': 'call_rfPHKHEPZ1UntCcHBZBOeQtM', 'function': {'arguments': '{""query"":""Doris Duke most recent song or album""}', 'name': 'search_knowledge_base'}, 'type': 'function'}], 'Doris Duke\'s most recent album is ""I\'m A Loser,"" which was released on February 4, 2021.', 'The most recent album by Doris Duke is ""I\'m A Loser,"" which was released on February 4, 2021.']",4,1418,simple_w_condition,fast-changing,"['search_song_entity_by_name', 'get_grammy_award_count_by_song', 'get_song_release_date', 'get_song_author', 'get_grammy_award_count_by_artist']" -what song topped the billboard chart on 2004-02-04?,"03/21/2024, 23:34:49 PT",hey ya!,"The song that topped the Billboard chart on February 4, 2004, was ""Hey Ya!"" by OutKast.","[[{'id': 'call_nwInngToFLLUFMeKYuR5lwhx', 'function': {'arguments': '{""rank"":1,""date"":""2004-02-04""}', 'name': 'get_billboard_rank_date'}, 'type': 'function'}], '{""result"": [[""Hey Ya!""], [""OutKast""]]}', 'The song that topped the Billboard chart on February 4, 2004, was ""Hey Ya!"" by OutKast.']",2,794,simple_w_condition,fast-changing,"['get_billboard_attributes', 'get_song_release_date', 'search_song_entity_by_name', 'get_song_release_country', 'get_billboard_rank_date']" -what is the most popular song on billboard in 2024-02-28?,"03/21/2024, 23:34:42 PT",texas hold 'em,"The most popular song on Billboard on February 28, 2024, was ""Texas Hold 'Em"" by Beyonce.","[[{'id': 'call_ctgHbLk8zIctCE0qOgjtcTU7', 'function': {'arguments': '{""rank"":1,""date"":""2024-02-28""}', 'name': 'get_billboard_rank_date'}, 'type': 'function'}], '{""result"": [[""Texas Hold \'Em""], [""Beyonce""]]}', 'The most popular song on Billboard on February 28, 2024, was ""Texas Hold \'Em"" by Beyonce.']",2,798,simple_w_condition,fast-changing,"['get_song_release_date', 'get_billboard_attributes', 'get_song_release_country', 'search_song_entity_by_name', 'get_billboard_rank_date']" -"what album did maroon five release in 2010, which included the songs ""moves like jagger"" and ""misery""?","03/13/2024, 09:49:01 PT","maroon five released the album ""hands all over"" in 2010, which included the songs ""moves like jagger"" and ""misery"".","Sorry, need more steps to process this request.","[[{'id': 'call_TiKh63jRcIYXYAsXxrzUB0Ip', 'function': {'arguments': '{""song_name"":""moves like jagger""}', 'name': 'search_song_entity_by_name'}, 'type': 'function'}], '{""result"": [""Moves Like Jagger"", ""Moves Like Jagger (remix)"", ""Moves Like Jagger (Remixes)"", ""D.N.A. 
Moves Like Jagger Mashup"", ""Moves Like Jagger / Jumpin\' Jack Flash"", ""Moves Like Jagger (The Voice Performance)"", ""Move Like Jagger"", ""Foxy Jagger"", ""Jagger \'67"", ""Peter Jagger""]}', [{'id': 'call_KlfgNIKNzIjm9qAwpWXOkl5n', 'function': {'arguments': '{""song_name"":""misery""}', 'name': 'search_song_entity_by_name'}, 'type': 'function'}], '{""result"": [""Misery"", ""MISERY"", ""misery"", ""Misery..."", ""Artificial Misery"", ""Beautiful Misery"", ""Blissful Misery"", ""Burn // Misery"", ""Collected Misery"", ""Concret\\u00e9 Misery""]}', [{'id': 'call_NePkutCu9BVaeeMEV2xJNXhE', 'function': {'arguments': '{""song_name"":""moves like jagger""}', 'name': 'get_grammy_award_count_by_song'}, 'type': 'function'}], '{""result"": 0}', [{'id': 'call_0fGXDvgbkvbEHMnRNBdIfK7i', 'function': {'arguments': '{""song_name"":""misery""}', 'name': 'get_grammy_award_count_by_song'}, 'type': 'function'}], '{""result"": 0}', 'Sorry, need more steps to process this request.']",5,500,simple_w_condition,static,"['get_song_release_date', 'get_song_release_country', 'get_grammy_best_album_by_year', 'get_grammy_award_count_by_song', 'search_song_entity_by_name']" From 9556f9a189ad51e3fbfcafc7bcb551aceb3d812f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 27 Aug 2024 21:31:38 +0000 Subject: [PATCH 12/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- evals/evaluation/crag_eval/README.md | 1 - .../crag_eval/docker/build_image.sh | 5 +- .../crag_eval/docker/launch_eval_container.sh | 5 +- .../crag_eval/docker/requirements.txt | 10 +-- .../crag_eval/preprocess_data/process_data.py | 58 +++++++++------- .../preprocess_data/run_data_preprocess.sh | 5 +- .../preprocess_data/run_sample_data.sh | 5 +- .../crag_eval/preprocess_data/sample_data.py | 24 ++++--- .../run_benchmark/generate_answers.py | 55 ++++++++------- .../crag_eval/run_benchmark/grade_answers.py | 67 ++++++++++--------- .../docker-compose-llm-judge-gaudi.yaml | 5 +- .../llm_judge/docker-compose-llm-judge.yaml | 5 +- .../llm_judge/launch_llm_judge_endpoint.sh | 5 +- .../llm_judge/test_llm_endpoint.py | 8 ++- .../run_benchmark/run_generate_answer.sh | 3 + .../crag_eval/run_benchmark/run_grading.sh | 3 + evals/metrics/ragas/ragas.py | 30 +++++---- 17 files changed, 178 insertions(+), 116 deletions(-) diff --git a/evals/evaluation/crag_eval/README.md b/evals/evaluation/crag_eval/README.md index a277c136..1e5ce1c2 100644 --- a/evals/evaluation/crag_eval/README.md +++ b/evals/evaluation/crag_eval/README.md @@ -104,4 +104,3 @@ python3 test_llm_endpoint.py cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark/ bash run_grading.sh ``` - diff --git a/evals/evaluation/crag_eval/docker/build_image.sh b/evals/evaluation/crag_eval/docker/build_image.sh index 284762c7..a743900f 100644 --- a/evals/evaluation/crag_eval/docker/build_image.sh +++ b/evals/evaluation/crag_eval/docker/build_image.sh @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + dockerfile=Dockerfile docker build \ @@ -6,4 +9,4 @@ docker build \ --network=host \ --build-arg http_proxy=${http_proxy} \ --build-arg https_proxy=${https_proxy} \ - --build-arg no_proxy=${no_proxy} \ \ No newline at end of file + --build-arg no_proxy=${no_proxy} \ diff --git a/evals/evaluation/crag_eval/docker/launch_eval_container.sh b/evals/evaluation/crag_eval/docker/launch_eval_container.sh index 32b27f8e..8698f452 100644 --- 
a/evals/evaluation/crag_eval/docker/launch_eval_container.sh +++ b/evals/evaluation/crag_eval/docker/launch_eval_container.sh @@ -1,4 +1,7 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + volume=$WORKDIR host_ip=$(hostname -I | awk '{print $1}') -docker run -it -v $volume:/home/user/ -e WORKDIR=/home/user -e HF_HOME=/home/user/hf_cache -e host_ip=$host_ip -e http_proxy=$http_proxy -e https_proxy=$https_proxy crag-eval:latest \ No newline at end of file +docker run -it -v $volume:/home/user/ -e WORKDIR=/home/user -e HF_HOME=/home/user/hf_cache -e host_ip=$host_ip -e http_proxy=$http_proxy -e https_proxy=$https_proxy crag-eval:latest diff --git a/evals/evaluation/crag_eval/docker/requirements.txt b/evals/evaluation/crag_eval/docker/requirements.txt index 4579eca7..b32606b7 100644 --- a/evals/evaluation/crag_eval/docker/requirements.txt +++ b/evals/evaluation/crag_eval/docker/requirements.txt @@ -1,8 +1,8 @@ -pandas datasets -ragas -langchain-community -langchain-huggingface evaluate jieba -sentence_transformers \ No newline at end of file +langchain-community +langchain-huggingface +pandas +ragas +sentence_transformers diff --git a/evals/evaluation/crag_eval/preprocess_data/process_data.py b/evals/evaluation/crag_eval/preprocess_data/process_data.py index 6f0c6775..2f5dea00 100644 --- a/evals/evaluation/crag_eval/preprocess_data/process_data.py +++ b/evals/evaluation/crag_eval/preprocess_data/process_data.py @@ -1,10 +1,16 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse import json import os -import argparse + import tqdm + def split_text(text, chunk_size=2000, chunk_overlap=400): from langchain_text_splitters import RecursiveCharacterTextSplitter + text_splitter = RecursiveCharacterTextSplitter( # Set a really small chunk size, just to show. 
chunk_size=chunk_size, @@ -15,14 +21,16 @@ def split_text(text, chunk_size=2000, chunk_overlap=400): ) return text_splitter.split_text(text) + def process_html_string(text): from bs4 import BeautifulSoup + # print(text) soup = BeautifulSoup(text, features="html.parser") # kill all script and style elements for script in soup(["script", "style"]): - script.extract() # rip it out + script.extract() # rip it out # get text text_content = soup.get_text() @@ -32,21 +40,22 @@ def process_html_string(text): # break multi-headlines into a line each chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) # drop blank lines - final_text = '\n'.join(chunk for chunk in chunks if chunk) + final_text = "\n".join(chunk for chunk in chunks if chunk) # print(final_text) return final_text + def preprocess_data(input_file): snippet = [] return_data = [] n = 0 - with open(input_file, 'r') as f: + with open(input_file, "r") as f: for line in f: data = json.loads(line) # search results snippets --> retrieval corpus docs - docs = data['search_results'] - + docs = data["search_results"] + for doc in docs: # chunks = split_text(doc['page_snippet']) # for chunk in chunks: @@ -54,30 +63,27 @@ def preprocess_data(input_file): # "query": data['query'], # "domain": data['domain'], # "doc":chunk}) - snippet.append({ - "query": data['query'], - "domain": data['domain'], - "doc":doc['page_snippet']}) - print('-----------------------------------') - + snippet.append({"query": data["query"], "domain": data["domain"], "doc": doc["page_snippet"]}) + print("-----------------------------------") + # qa pairs without search results output = {} for k, v in data.items(): - if k != 'search_results': + if k != "search_results": output[k] = v return_data.append(output) return snippet, return_data - -if __name__ == '__main__': + +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--filedir', type=str, default=None) - parser.add_argument('--docout', type=str, default=None) - parser.add_argument('--qaout', type=str, default=None) + parser.add_argument("--filedir", type=str, default=None) + parser.add_argument("--docout", type=str, default=None) + parser.add_argument("--qaout", type=str, default=None) # parser.add_argument('--chunk_size', type=int, default=10000) # parser.add_argument('--chunk_overlap', type=int, default=0) - + args = parser.parse_args() if not os.path.exists(args.docout): @@ -95,17 +101,17 @@ def preprocess_data(input_file): doc, data = preprocess_data(file) docs.extend(doc) qa_pairs.extend(data) - + # group by domain domains = ["finance", "music", "movie", "sports", "open"] for domain in domains: - with open(os.path.join(args.docout, "crag_docs_"+domain + ".jsonl"), 'w') as f: + with open(os.path.join(args.docout, "crag_docs_" + domain + ".jsonl"), "w") as f: for doc in docs: - if doc['doc']!="" and doc['domain'] == domain: - f.write(json.dumps(doc) + '\n') + if doc["doc"] != "" and doc["domain"] == domain: + f.write(json.dumps(doc) + "\n") - with open(os.path.join(args.qaout, "crag_qa_"+domain + ".jsonl"), 'w') as f: + with open(os.path.join(args.qaout, "crag_qa_" + domain + ".jsonl"), "w") as f: for d in qa_pairs: - if d['domain'] == domain: - f.write(json.dumps(d) + '\n') \ No newline at end of file + if d["domain"] == domain: + f.write(json.dumps(d) + "\n") diff --git a/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh b/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh index fc6f19be..d93a5fa1 100644 --- 
a/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh +++ b/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh @@ -1,5 +1,8 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + FILEDIR=$WORKDIR/datasets/crag_task_3_dev_v4 DOCOUT=$WORKDIR/datasets/crag_docs QAOUT=$WORKDIR/datasets/crag_qas -python3 process_data.py --data_dir $FILEDIR --docout $DOCOUT --qaout $QAOUT \ No newline at end of file +python3 process_data.py --data_dir $FILEDIR --docout $DOCOUT --qaout $QAOUT diff --git a/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh b/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh index 06514702..e0e3b0c8 100644 --- a/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh +++ b/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + FILEDIR=$WORKDIR/datasets/crag_docs -python3 sample_data.py --filedir $FILEDIR \ No newline at end of file +python3 sample_data.py --filedir $FILEDIR diff --git a/evals/evaluation/crag_eval/preprocess_data/sample_data.py b/evals/evaluation/crag_eval/preprocess_data/sample_data.py index 1fcb4916..f4aa8209 100644 --- a/evals/evaluation/crag_eval/preprocess_data/sample_data.py +++ b/evals/evaluation/crag_eval/preprocess_data/sample_data.py @@ -1,26 +1,32 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse import json -import pandas as pd import os -import argparse + +import pandas as pd import tqdm + def sample_data(input_file, output_file): df = pd.read_json(input_file, lines=True, convert_dates=False) # group by `question_type` and `static_or_dynamic` - df_grouped = df.groupby(['question_type', 'static_or_dynamic']) + df_grouped = df.groupby(["question_type", "static_or_dynamic"]) # sample 5 rows from each group if there are more than 5 rows else return all rows df_sampled = df_grouped.apply(lambda x: x.sample(5) if len(x) > 5 else x) # save sampled data to output file - df_sampled.to_json(output_file, orient='records', lines=True) + df_sampled.to_json(output_file, orient="records", lines=True) -if __name__ == '__main__': + +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--filedir', type=str, default=None) - + parser.add_argument("--filedir", type=str, default=None) + args = parser.parse_args() data_files = os.listdir(args.filedir) for file in tqdm.tqdm(data_files): file = os.path.join(args.filedir, file) - output_file = file.replace('.jsonl', '_sampled.jsonl') - sample_data(file, output_file) \ No newline at end of file + output_file = file.replace(".jsonl", "_sampled.jsonl") + sample_data(file, output_file) diff --git a/evals/evaluation/crag_eval/run_benchmark/generate_answers.py b/evals/evaluation/crag_eval/run_benchmark/generate_answers.py index 6c55da3c..bad9b768 100644 --- a/evals/evaluation/crag_eval/run_benchmark/generate_answers.py +++ b/evals/evaluation/crag_eval/run_benchmark/generate_answers.py @@ -1,42 +1,51 @@ -import requests -import json +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import argparse +import json import os + import pandas as pd +import requests + def get_test_data(args): - if args.query_file.endswith('.jsonl'): + if args.query_file.endswith(".jsonl"): df = pd.read_json(args.query_file, lines=True, convert_dates=False) - elif args.query_file.endswith('.csv'): + elif args.query_file.endswith(".csv"): df = 
pd.read_csv(args.query_file) return df + def generate_answer(url, prompt): proxies = {"http": ""} payload = { - "query":prompt, - } + "query": prompt, + } response = requests.post(url, json=payload, proxies=proxies) answer = response.json()["text"] return answer -def save_results(output_file, output_list): + +def save_results(output_file, output_list): with open(output_file, "w") as f: for output in output_list: f.write(json.dumps(output)) f.write("\n") + def save_as_csv(output): df = pd.read_json(output, lines=True, convert_dates=False) df.to_csv(output.replace(".jsonl", ".csv"), index=False) + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--endpoint_url", type=str, default=None, help="url of the agent QnA system endpoint") parser.add_argument("--query_file", type=str, default=None, help="query jsonl file") parser.add_argument("--output_file", type=str, default="output.jsonl", help="output jsonl file") args = parser.parse_args() - + url = args.endpoint_url df = get_test_data(args) @@ -45,27 +54,27 @@ def save_as_csv(output): output_list = [] n = 0 for _, row in df.iterrows(): - q = row['query'] - t = row['query_time'] + q = row["query"] + t = row["query_time"] prompt = "Question: {}\nThe question was asked at: {}".format(q, t) - print('******Query:\n',prompt) + print("******Query:\n", prompt) print("******Agent is working on the query") answer = generate_answer(url, prompt) - print('******Answer from agent:\n',answer) - print('='*50) + print("******Answer from agent:\n", answer) + print("=" * 50) output_list.append( - { - "query": q, - "query_time": t, - "ref_answer": row["answer"], - "answer": answer, - "question_type": row["question_type"], - "static_or_dynamic": row["static_or_dynamic"], - } - ) + { + "query": q, + "query_time": t, + "ref_answer": row["answer"], + "answer": answer, + "question_type": row["question_type"], + "static_or_dynamic": row["static_or_dynamic"], + } + ) save_results(args.output_file, output_list) # n += 1 # if n > 1: # break save_results(args.output_file, output_list) - save_as_csv(args.output_file) \ No newline at end of file + save_as_csv(args.output_file) diff --git a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py index ddeecdb6..094a87aa 100644 --- a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py +++ b/evals/evaluation/crag_eval/run_benchmark/grade_answers.py @@ -1,18 +1,24 @@ -from evals.metrics.ragas import RagasMetric -from ragas.metrics import answer_correctness +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import argparse -import pandas as pd import os +import pandas as pd +from ragas.metrics import answer_correctness + +from evals.metrics.ragas import RagasMetric + + def convert_data_format_for_ragas(data): # data: pandas dataframe # columns: ['query', 'answer', 'ref_answer'] # return: a dict with keys: 'input', 'actual_output', 'expected_output' output = { - 'input': data['query'].tolist(), - 'actual_output': data['answer'].tolist(), - 'expected_output': data['ref_answer'].tolist(), - 'retrieval_context': [["dummy_context"] for _ in range(data['query'].shape[0])] + "input": data["query"].tolist(), + "actual_output": data["answer"].tolist(), + "expected_output": data["ref_answer"].tolist(), + "retrieval_context": [["dummy_context"] for _ in range(data["query"].shape[0])], } return output @@ -25,44 +31,48 @@ def make_list_of_test_cases(data): for _, row in data.iterrows(): output.append( { - 'input': 
[row['query']], - 'actual_output': [row['answer']], - 'expected_output': [row['ref_answer']], - 'retrieval_context': [["dummy_context"]] + "input": [row["query"]], + "actual_output": [row["answer"]], + "expected_output": [row["ref_answer"]], + "retrieval_context": [["dummy_context"]], } ) return output + def grade_answers(args, test_case): from langchain_community.embeddings import HuggingFaceBgeEmbeddings - print('==============getting embeddings==============') + + print("==============getting embeddings==============") embeddings = HuggingFaceBgeEmbeddings(model_name=args.embed_model) - print('==============initiating metric==============') - metric = RagasMetric(threshold=0.5, - metrics=["answer_correctness"], - model= args.llm_endpoint, - embeddings=embeddings) - print('==============start grading==============') + print("==============initiating metric==============") + metric = RagasMetric(threshold=0.5, metrics=["answer_correctness"], model=args.llm_endpoint, embeddings=embeddings) + print("==============start grading==============") if args.batch_grade: metric.measure(test_case) - return metric.score['answer_correctness'] + return metric.score["answer_correctness"] else: scores = [] for case in test_case: metric.measure(case) - scores.append(metric.score['answer_correctness']) + scores.append(metric.score["answer_correctness"]) print(metric.score) - print('-'*50) + print("-" * 50) return scores -if __name__ == '__main__': + +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--embed_model", type=str, default="BAAI/bge-base-en-v1.5") parser.add_argument("--llm_endpoint", type=str, default="http://localhost:8008") parser.add_argument("--filedir", type=str, help="Path to the file containing the data") parser.add_argument("--filename", type=str, help="Name of the file containing the data") - parser.add_argument("--batch_grade", action="store_true", help="Grade the answers in batch and get an aggregated score for the entire dataset") + parser.add_argument( + "--batch_grade", + action="store_true", + help="Grade the answers in batch and get an aggregated score for the entire dataset", + ) args = parser.parse_args() data = pd.read_csv(os.path.join(args.filedir, args.filename)) @@ -71,7 +81,7 @@ def grade_answers(args, test_case): test_case = convert_data_format_for_ragas(data) else: test_case = make_list_of_test_cases(data) - + # print(test_case) scores = grade_answers(args, test_case) @@ -80,11 +90,8 @@ def grade_answers(args, test_case): if args.batch_grade: print("Aggregated answer correctness score: ", scores) else: - data['answer_correctness'] = scores - print("Average answer correctness score: ", data['answer_correctness'].mean()) - output_file = args.filename.split('.')[0] + '_graded.csv' + data["answer_correctness"] = scores + print("Average answer correctness score: ", data["answer_correctness"].mean()) + output_file = args.filename.split(".")[0] + "_graded.csv" data.to_csv(os.path.join(args.filedir, output_file), index=False) print("Scores saved to ", os.path.join(args.filedir, output_file)) - - - diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml index 56b1e6dd..572011ef 100644 --- a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel 
Corporation +# SPDX-License-Identifier: Apache-2.0 + services: tgi-service: image: ghcr.io/huggingface/tgi-gaudi:latest @@ -20,4 +23,4 @@ services: cap_add: - SYS_NICE ipc: host - command: --model-id ${LLM_MODEL_ID} --max-input-length 4096 --max-total-tokens 8192 --sharded true --num-shard 4 \ No newline at end of file + command: --model-id ${LLM_MODEL_ID} --max-input-length 4096 --max-total-tokens 8192 --sharded true --num-shard 4 diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml index 4f6f7f23..a954098e 100644 --- a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + version: "3.8" services: @@ -16,4 +19,4 @@ services: HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 - command: --model-id ${LLM_MODEL_ID} \ No newline at end of file + command: --model-id ${LLM_MODEL_ID} diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh b/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh index 5b13d01d..0cb08d8f 100644 --- a/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh @@ -1,4 +1,7 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + export LLM_MODEL_ID="meta-llama/Meta-Llama-3-70B-Instruct" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export HF_CACHE_DIR=${HF_CACHE_DIR} -docker compose -f docker-compose-llm-judge-gaudi.yaml up -d \ No newline at end of file +docker compose -f docker-compose-llm-judge-gaudi.yaml up -d diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py b/evals/evaluation/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py index ce961173..c23f6af9 100644 --- a/evals/evaluation/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py +++ b/evals/evaluation/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py @@ -1,7 +1,11 @@ -from langchain_huggingface import HuggingFaceEndpoint +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import os -host_ip=os.environ.get("host_ip", "localhost") +from langchain_huggingface import HuggingFaceEndpoint + +host_ip = os.environ.get("host_ip", "localhost") url = "http://{host_ip}:8085".format(host_ip=host_ip) print(url) diff --git a/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh b/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh index d4529fdb..b8e594e4 100644 --- a/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh +++ b/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + host_ip=$(hostname -I | awk '{print $1}') # change this to the IP of the agent port=9095 # change this to the port of the agent endpoint=${port}/v1/chat/completions # change this to the endpoint of the agent diff --git a/evals/evaluation/crag_eval/run_benchmark/run_grading.sh b/evals/evaluation/crag_eval/run_benchmark/run_grading.sh index 82cae435..ac432787 100644 --- a/evals/evaluation/crag_eval/run_benchmark/run_grading.sh +++ 
b/evals/evaluation/crag_eval/run_benchmark/run_grading.sh @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + FILEDIR=$WORKDIR/datasets/crag_results/ FILENAME=results.csv LLM_ENDPOINT=http://${host_ip}:8085 diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index d80a3745..06bf96da 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -7,9 +7,9 @@ import os from typing import Dict, Optional, Union -from langchain_huggingface import HuggingFaceEndpoint from langchain_core.embeddings import Embeddings from langchain_core.language_models import BaseLanguageModel +from langchain_huggingface import HuggingFaceEndpoint def format_ragas_metric_name(name: str): @@ -20,22 +20,26 @@ def get_metric(name: str): validated_list = ["answer_relevancy", "faithfulness", "answer_correctness"] if name == "answer_relevancy": from ragas.metrics import answer_relevancy + return answer_relevancy elif name == "faithfulness": from ragas.metrics import faithfulness + return faithfulness elif name == "answer_correctness": from ragas.metrics import answer_correctness + return answer_correctness else: raise ValueError( - "metric should be in supported list {}. ".format(validated_list) - + "ClientResponseError raised with LangchainLLM " - + "when context_precision, context_recall ran. " - + "Here are the related issues described in ragas " - "https://github.com/explodinggradients/ragas/issues/934, " - + "https://github.com/explodinggradients/ragas/issues/664." - ) + "metric should be in supported list {}. ".format(validated_list) + + "ClientResponseError raised with LangchainLLM " + + "when context_precision, context_recall ran. " + + "Here are the related issues described in ragas " + "https://github.com/explodinggradients/ragas/issues/934, " + + "https://github.com/explodinggradients/ragas/issues/664." 
+ ) + class RagasMetric: """This metric checks if the output is more than 3 letters.""" @@ -81,7 +85,7 @@ def __init__( print("OPENAI_API_KEY is provided, ragas initializes the model by OpenAI.") self.model = None if isinstance(self.model, str): - print('LLM endpoint: ', self.model) + print("LLM endpoint: ", self.model) self.chat_model = HuggingFaceEndpoint( endpoint_url=self.model, task="text-generation", @@ -90,7 +94,7 @@ def __init__( ) else: self.chat_model = self.model - + # initialize metrics if self.metrics is not None: tmp_metrics = [] @@ -110,7 +114,7 @@ def __init__( raise ValueError("answer_relevancy metric need provide embeddings model.") tmp_metrics.append(get_metric(metric)) self.metrics = tmp_metrics - else: # default metrics + else: # default metrics self.metrics = [ answer_relevancy, faithfulness, @@ -121,17 +125,17 @@ def __init__( context_recall, ] - async def a_measure(self, test_case: Dict): return self.measure(test_case) def measure(self, test_case: Dict): from ragas import evaluate + try: from datasets import Dataset except ModuleNotFoundError: raise ModuleNotFoundError("Please install dataset") - + # Create a dataset from the test case # Convert the Dict to a format compatible with Dataset data = { From 90e855bd43071b3ea04ad7dc4a831768817a96d6 Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Tue, 27 Aug 2024 15:39:56 -0700 Subject: [PATCH 13/18] update readme and test all commands --- evals/evaluation/crag_eval/README.md | 29 +++++++++++++++---- .../crag_eval/preprocess_data/process_data.py | 5 +++- .../preprocess_data/run_data_preprocess.sh | 6 ++-- .../preprocess_data/run_sample_data.sh | 2 +- .../crag_eval/preprocess_data/sample_data.py | 1 + .../run_benchmark/generate_answers.py | 3 ++ .../run_benchmark/run_generate_answer.sh | 4 +-- .../crag_eval/run_benchmark/run_grading.sh | 4 +-- 8 files changed, 39 insertions(+), 15 deletions(-) diff --git a/evals/evaluation/crag_eval/README.md b/evals/evaluation/crag_eval/README.md index 1e5ce1c2..df7fd772 100644 --- a/evals/evaluation/crag_eval/README.md +++ b/evals/evaluation/crag_eval/README.md @@ -45,17 +45,22 @@ Data preprocessing directly relates to the quality of retrieval corpus and thus cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/preprocess_data bash run_data_preprocess.sh ``` -3. Optional - Sample queries for benchmark +**Note**: This is an example of data processing. You can develop and optimize your own data processing for this benchmark. +3. Sample queries for benchmark The CRAG dataset has more than 4000 queries, and running all of them can be very expensive and time-consuming. You can sample a subset for benchmark. Here we provide a script to sample up to 5 queries per question_type per dynamism in each domain. For example, we were able to get 92 queries from the music domain using the script. ``` bash run_sample_data.sh ``` ## Launch agent QnA system -Here we showcase a RAG agent in GenAIExample repo. Please refer to the README in the AgentQnA example for more details. You can build your own agent systems using OPEA components, then expose your own systems as an endpoint for this benchmark. +Here we showcase a RAG agent in GenAIExample repo. Please refer to the README in the [AgentQnA example](https://github.com/opea-project/GenAIExamples/tree/main/AgentQnA) for more details.
+**Please note**: This AgentQnA example is only one option. You can build your own agent system with OPEA components and expose it as an endpoint for this benchmark; it only needs to honor the request/response contract sketched below.
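+Whichever system you expose, the contract is the one used by `generate_answers.py`: the benchmark POSTs a JSON body with a single `query` field (the question plus its timestamp) and reads the agent's answer from the `text` field of the JSON response. A minimal sketch of a compatible call, assuming the default port and route from `run_generate_answer.sh`; the host, question and timestamp below are placeholders:
+```python
+import requests
+
+# Placeholder URL: default port 9095 and /v1/chat/completions route from run_generate_answer.sh;
+# substitute the host/port where your own agent endpoint is served.
+url = "http://localhost:9095/v1/chat/completions"
+
+# The benchmark sends the query and its timestamp in a single prompt string (illustrative values).
+prompt = "Question: {}\nThe question was asked at: {}".format(
+    "example question about the music domain", "03/05/2024, 23:19:44 PT"
+)
+
+response = requests.post(url, json={"query": prompt}, proxies={"http": ""})
+print(response.json()["text"])  # the agent's answer, as read by generate_answers.py
+```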
+To launch the agent in our AgentQnA example, open another terminal and build images and launch agent system there. 1. Build images ``` -git clone +export $WORKDIR= +cd $WORKDIR +git clone https://github.com/opea-project/GenAIExamples.git cd GenAIExamples/AgentQnA/tests/ bash 1_build_images.sh ``` @@ -65,17 +70,27 @@ bash 2_start_retrieval_tool.sh ``` 3. Ingest data into vector database and validate retrieval tool ``` -# Placeholder - may change depending on data -bash 3_ingest_data_and_validate_retrieval.sh +# As an example, we will use the index_data.py script in AgentQnA example. +# You can write your own script to ingest data. +# As an example, We will ingest the docs of the music domain. +# We will use the crag-eval docker container to run the index_data.py script. +# The index_data.py is a client script. +# it will send data-indexing requests to the dataprep server that is part of the retrieval tool. +# So you need to switch back to the terminal where the crag-eval container is running. +cd $WORKDIR/GenAIExamples/AgentQnA/retrieval_tool/ +python3 index_data.py --host_ip $host_ip --filedir ${WORKDIR}/datasets/crag_docs/ --filename crag_docs_music.jsonl ``` -3. Launch and validate agent endpoint +4. Launch and validate agent endpoint ``` +# Go to the terminal where you launched the AgentQnA example +cd $WORKDIR/GenAIExamples/AgentQnA/tests/ bash 4_launch_and_validate_agent.sh ``` ## Run CRAG benchmark Once you have your agent system up and running, the next step is to generate answers with agent. Change the variables in the script below and run the script. By default, it will run a sampled set of queries in music domain. ``` +# Come back to the interactive crag-eval docker container cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark bash run_generate_answer.sh ``` @@ -96,11 +111,13 @@ curl ${host_ip}:8085/generate_stream \ ``` And then go back to the interactive crag-eval docker, run command below. ``` +# Inside the crag-eval container cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark/llm_judge/ python3 test_llm_endpoint.py ``` 3. Grade the answer correctness using LLM judge. We use `answer_correctness` metrics from [ragas](https://github.com/explodinggradients/ragas/blob/main/src/ragas/metrics/_answer_correctness.py). 
``` +# Inside the crag-eval container cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark/ bash run_grading.sh ``` diff --git a/evals/evaluation/crag_eval/preprocess_data/process_data.py b/evals/evaluation/crag_eval/preprocess_data/process_data.py index 2f5dea00..e56df9e1 100644 --- a/evals/evaluation/crag_eval/preprocess_data/process_data.py +++ b/evals/evaluation/crag_eval/preprocess_data/process_data.py @@ -64,7 +64,6 @@ def preprocess_data(input_file): # "domain": data['domain'], # "doc":chunk}) snippet.append({"query": data["query"], "domain": data["domain"], "doc": doc["page_snippet"]}) - print("-----------------------------------") # qa pairs without search results output = {} @@ -73,6 +72,10 @@ def preprocess_data(input_file): output[k] = v return_data.append(output) + n+=1 + if n == 10: + break + return snippet, return_data diff --git a/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh b/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh index d93a5fa1..780f5f29 100644 --- a/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh +++ b/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 FILEDIR=$WORKDIR/datasets/crag_task_3_dev_v4 -DOCOUT=$WORKDIR/datasets/crag_docs -QAOUT=$WORKDIR/datasets/crag_qas +DOCOUT=$WORKDIR/datasets/crag_docs/ +QAOUT=$WORKDIR/datasets/crag_qas/ -python3 process_data.py --data_dir $FILEDIR --docout $DOCOUT --qaout $QAOUT +python3 process_data.py --filedir $FILEDIR --docout $DOCOUT --qaout $QAOUT diff --git a/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh b/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh index e0e3b0c8..dd104326 100644 --- a/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh +++ b/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh @@ -1,6 +1,6 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -FILEDIR=$WORKDIR/datasets/crag_docs +FILEDIR=$WORKDIR/datasets/crag_qas python3 sample_data.py --filedir $FILEDIR diff --git a/evals/evaluation/crag_eval/preprocess_data/sample_data.py b/evals/evaluation/crag_eval/preprocess_data/sample_data.py index f4aa8209..51621194 100644 --- a/evals/evaluation/crag_eval/preprocess_data/sample_data.py +++ b/evals/evaluation/crag_eval/preprocess_data/sample_data.py @@ -27,6 +27,7 @@ def sample_data(input_file, output_file): data_files = os.listdir(args.filedir) for file in tqdm.tqdm(data_files): + print(file) file = os.path.join(args.filedir, file) output_file = file.replace(".jsonl", "_sampled.jsonl") sample_data(file, output_file) diff --git a/evals/evaluation/crag_eval/run_benchmark/generate_answers.py b/evals/evaluation/crag_eval/run_benchmark/generate_answers.py index bad9b768..06eed7e7 100644 --- a/evals/evaluation/crag_eval/run_benchmark/generate_answers.py +++ b/evals/evaluation/crag_eval/run_benchmark/generate_answers.py @@ -51,6 +51,9 @@ def save_as_csv(output): df = get_test_data(args) # df = df.head() # for validation purpose + if not os.path.exists(os.path.dirname(args.output_file)): + os.makedirs(os.path.dirname(args.output_file)) + output_list = [] n = 0 for _, row in df.iterrows(): diff --git a/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh b/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh index b8e594e4..ee863bba 100644 --- a/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh +++ b/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh @@ -1,14 
+1,14 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -host_ip=$(hostname -I | awk '{print $1}') # change this to the IP of the agent +host_ip=$host_ip # change this to the host IP of the agent port=9095 # change this to the port of the agent endpoint=${port}/v1/chat/completions # change this to the endpoint of the agent URL="http://${host_ip}:${endpoint}" echo "AGENT ENDPOINT URL: ${URL}" QUERYFILE=$WORKDIR/datasets/crag_qas/crag_qa_music_sampled.jsonl -OUTPUTFILE=$WORKDIR/datasets/crag_results/results.jsonl +OUTPUTFILE=$WORKDIR/datasets/crag_results/crag_music_sampled_results.jsonl python3 generate_answers.py \ --endpoint_url ${URL} \ diff --git a/evals/evaluation/crag_eval/run_benchmark/run_grading.sh b/evals/evaluation/crag_eval/run_benchmark/run_grading.sh index ac432787..5431d39b 100644 --- a/evals/evaluation/crag_eval/run_benchmark/run_grading.sh +++ b/evals/evaluation/crag_eval/run_benchmark/run_grading.sh @@ -2,8 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 FILEDIR=$WORKDIR/datasets/crag_results/ -FILENAME=results.csv -LLM_ENDPOINT=http://${host_ip}:8085 +FILENAME=crag_music_sampled_results.csv +LLM_ENDPOINT=http://${host_ip}:8085 # change host_ip to the IP of LLM endpoint python3 grade_answers.py \ --filedir $FILEDIR \ From 153e30fd3370eced47ee50dfdbe490dc0c1a0e6a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 27 Aug 2024 22:40:15 +0000 Subject: [PATCH 14/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- evals/evaluation/crag_eval/preprocess_data/process_data.py | 2 +- evals/evaluation/crag_eval/run_benchmark/generate_answers.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/evals/evaluation/crag_eval/preprocess_data/process_data.py b/evals/evaluation/crag_eval/preprocess_data/process_data.py index e56df9e1..f8f4bb39 100644 --- a/evals/evaluation/crag_eval/preprocess_data/process_data.py +++ b/evals/evaluation/crag_eval/preprocess_data/process_data.py @@ -72,7 +72,7 @@ def preprocess_data(input_file): output[k] = v return_data.append(output) - n+=1 + n += 1 if n == 10: break diff --git a/evals/evaluation/crag_eval/run_benchmark/generate_answers.py b/evals/evaluation/crag_eval/run_benchmark/generate_answers.py index 06eed7e7..19f7f747 100644 --- a/evals/evaluation/crag_eval/run_benchmark/generate_answers.py +++ b/evals/evaluation/crag_eval/run_benchmark/generate_answers.py @@ -53,7 +53,7 @@ def save_as_csv(output): if not os.path.exists(os.path.dirname(args.output_file)): os.makedirs(os.path.dirname(args.output_file)) - + output_list = [] n = 0 for _, row in df.iterrows(): From d51c0832105570888bee4d269c526773c1e81ccc Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Mon, 9 Sep 2024 09:47:43 -0700 Subject: [PATCH 15/18] mv crag_eval to agent_eval Signed-off-by: minmin-intel --- .../{ => agent_eval}/crag_eval/README.md | 10 ++-- .../crag_eval/docker/Dockerfile | 0 .../crag_eval/docker/build_image.sh | 0 .../crag_eval/docker/launch_eval_container.sh | 0 .../crag_eval/docker/requirements.txt | 0 .../crag_eval/preprocess_data/process_data.py | 0 .../preprocess_data/run_data_preprocess.sh | 0 .../preprocess_data/run_sample_data.sh | 0 .../crag_eval/preprocess_data/sample_data.py | 0 .../run_benchmark/generate_answers.py | 0 .../crag_eval/run_benchmark/grade_answers.py | 0 .../docker-compose-llm-judge-gaudi.yaml | 0 .../llm_judge/docker-compose-llm-judge.yaml | 0 .../llm_judge/launch_llm_judge_endpoint.sh 
| 0 .../llm_judge/test_llm_endpoint.py | 0 .../run_benchmark/run_generate_answer.sh | 0 .../crag_eval/run_benchmark/run_grading.sh | 0 evals/metrics/ragas/ragas.py | 53 ++++++++++--------- 18 files changed, 34 insertions(+), 29 deletions(-) rename evals/evaluation/{ => agent_eval}/crag_eval/README.md (94%) rename evals/evaluation/{ => agent_eval}/crag_eval/docker/Dockerfile (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/docker/build_image.sh (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/docker/launch_eval_container.sh (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/docker/requirements.txt (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/preprocess_data/process_data.py (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/preprocess_data/run_data_preprocess.sh (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/preprocess_data/run_sample_data.sh (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/preprocess_data/sample_data.py (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/run_benchmark/generate_answers.py (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/run_benchmark/grade_answers.py (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/run_benchmark/run_generate_answer.sh (100%) rename evals/evaluation/{ => agent_eval}/crag_eval/run_benchmark/run_grading.sh (100%) diff --git a/evals/evaluation/crag_eval/README.md b/evals/evaluation/agent_eval/crag_eval/README.md similarity index 94% rename from evals/evaluation/crag_eval/README.md rename to evals/evaluation/agent_eval/crag_eval/README.md index df7fd772..7b66f8a0 100644 --- a/evals/evaluation/crag_eval/README.md +++ b/evals/evaluation/agent_eval/crag_eval/README.md @@ -11,7 +11,7 @@ git clone https://github.com/opea-project/GenAIEval.git ``` 2. Build docker image ``` -cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/docker/ +cd $WORKDIR/GenAIEval/evals/evaluation/agent_eval/crag_eval/docker/ bash build_image.sh ``` 3. Set environment vars for downloading models from Huggingface @@ -42,7 +42,7 @@ tar -xf crag_task_3_dev_v4.tar.bz2 2. Preprocess the CRAG data Data preprocessing directly relates to the quality of retrieval corpus and thus can have significant impact on the agent QnA system. Here, we provide one way of preprocessing the data where we simply extracts all the web search snippets as-is from the dataset per domain. We also extract all the query-answer pairs along with other meta data per domain. You can run the command below to use our method. The data processing will take some time to finish. ``` -cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/preprocess_data +cd $WORKDIR/GenAIEval/evals/evaluation/agent_eval/crag_eval/preprocess_data bash run_data_preprocess.sh ``` **Note**: This is an example of data processing. You can develop and optimize your own data processing for this benchmark. @@ -91,7 +91,7 @@ bash 4_launch_and_validate_agent.sh Once you have your agent system up and running, the next step is to generate answers with agent. Change the variables in the script below and run the script. 
By default, it will run a sampled set of queries in music domain. ``` # Come back to the interactive crag-eval docker container -cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark +cd $WORKDIR/GenAIEval/evals/evaluation/agent_eval/crag_eval/run_benchmark bash run_generate_answer.sh ``` @@ -112,12 +112,12 @@ curl ${host_ip}:8085/generate_stream \ And then go back to the interactive crag-eval docker, run command below. ``` # Inside the crag-eval container -cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark/llm_judge/ +cd $WORKDIR/GenAIEval/evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/ python3 test_llm_endpoint.py ``` 3. Grade the answer correctness using LLM judge. We use `answer_correctness` metrics from [ragas](https://github.com/explodinggradients/ragas/blob/main/src/ragas/metrics/_answer_correctness.py). ``` # Inside the crag-eval container -cd $WORKDIR/GenAIEval/evals/evaluation/crag_eval/run_benchmark/ +cd $WORKDIR/GenAIEval/evals/evaluation/agent_eval/crag_eval/run_benchmark/ bash run_grading.sh ``` diff --git a/evals/evaluation/crag_eval/docker/Dockerfile b/evals/evaluation/agent_eval/crag_eval/docker/Dockerfile similarity index 100% rename from evals/evaluation/crag_eval/docker/Dockerfile rename to evals/evaluation/agent_eval/crag_eval/docker/Dockerfile diff --git a/evals/evaluation/crag_eval/docker/build_image.sh b/evals/evaluation/agent_eval/crag_eval/docker/build_image.sh similarity index 100% rename from evals/evaluation/crag_eval/docker/build_image.sh rename to evals/evaluation/agent_eval/crag_eval/docker/build_image.sh diff --git a/evals/evaluation/crag_eval/docker/launch_eval_container.sh b/evals/evaluation/agent_eval/crag_eval/docker/launch_eval_container.sh similarity index 100% rename from evals/evaluation/crag_eval/docker/launch_eval_container.sh rename to evals/evaluation/agent_eval/crag_eval/docker/launch_eval_container.sh diff --git a/evals/evaluation/crag_eval/docker/requirements.txt b/evals/evaluation/agent_eval/crag_eval/docker/requirements.txt similarity index 100% rename from evals/evaluation/crag_eval/docker/requirements.txt rename to evals/evaluation/agent_eval/crag_eval/docker/requirements.txt diff --git a/evals/evaluation/crag_eval/preprocess_data/process_data.py b/evals/evaluation/agent_eval/crag_eval/preprocess_data/process_data.py similarity index 100% rename from evals/evaluation/crag_eval/preprocess_data/process_data.py rename to evals/evaluation/agent_eval/crag_eval/preprocess_data/process_data.py diff --git a/evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh b/evals/evaluation/agent_eval/crag_eval/preprocess_data/run_data_preprocess.sh similarity index 100% rename from evals/evaluation/crag_eval/preprocess_data/run_data_preprocess.sh rename to evals/evaluation/agent_eval/crag_eval/preprocess_data/run_data_preprocess.sh diff --git a/evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh b/evals/evaluation/agent_eval/crag_eval/preprocess_data/run_sample_data.sh similarity index 100% rename from evals/evaluation/crag_eval/preprocess_data/run_sample_data.sh rename to evals/evaluation/agent_eval/crag_eval/preprocess_data/run_sample_data.sh diff --git a/evals/evaluation/crag_eval/preprocess_data/sample_data.py b/evals/evaluation/agent_eval/crag_eval/preprocess_data/sample_data.py similarity index 100% rename from evals/evaluation/crag_eval/preprocess_data/sample_data.py rename to evals/evaluation/agent_eval/crag_eval/preprocess_data/sample_data.py diff --git 
a/evals/evaluation/crag_eval/run_benchmark/generate_answers.py b/evals/evaluation/agent_eval/crag_eval/run_benchmark/generate_answers.py similarity index 100% rename from evals/evaluation/crag_eval/run_benchmark/generate_answers.py rename to evals/evaluation/agent_eval/crag_eval/run_benchmark/generate_answers.py diff --git a/evals/evaluation/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py similarity index 100% rename from evals/evaluation/crag_eval/run_benchmark/grade_answers.py rename to evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml b/evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml similarity index 100% rename from evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml rename to evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge-gaudi.yaml diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml b/evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml similarity index 100% rename from evals/evaluation/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml rename to evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/docker-compose-llm-judge.yaml diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh b/evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh similarity index 100% rename from evals/evaluation/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh rename to evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/launch_llm_judge_endpoint.sh diff --git a/evals/evaluation/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py b/evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py similarity index 100% rename from evals/evaluation/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py rename to evals/evaluation/agent_eval/crag_eval/run_benchmark/llm_judge/test_llm_endpoint.py diff --git a/evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_generate_answer.sh similarity index 100% rename from evals/evaluation/crag_eval/run_benchmark/run_generate_answer.sh rename to evals/evaluation/agent_eval/crag_eval/run_benchmark/run_generate_answer.sh diff --git a/evals/evaluation/crag_eval/run_benchmark/run_grading.sh b/evals/evaluation/agent_eval/crag_eval/run_benchmark/run_grading.sh similarity index 100% rename from evals/evaluation/crag_eval/run_benchmark/run_grading.sh rename to evals/evaluation/agent_eval/crag_eval/run_benchmark/run_grading.sh diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index 06bf96da..ac971364 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -16,29 +16,8 @@ def format_ragas_metric_name(name: str): return f"{name} (ragas)" -def get_metric(name: str): - validated_list = ["answer_relevancy", "faithfulness", "answer_correctness"] - if name == "answer_relevancy": - from ragas.metrics import answer_relevancy - - return answer_relevancy - elif name == "faithfulness": - from ragas.metrics import faithfulness - - return faithfulness - elif name == "answer_correctness": - from ragas.metrics import answer_correctness - - return answer_correctness 
- else: - raise ValueError( - "metric should be in supported list {}. ".format(validated_list) - + "ClientResponseError raised with LangchainLLM " - + "when context_precision, context_recall ran. " - + "Here are the related issues described in ragas " - "https://github.com/explodinggradients/ragas/issues/934, " - + "https://github.com/explodinggradients/ragas/issues/664." - ) + + class RagasMetric: @@ -112,7 +91,7 @@ def __init__( else: if metric == "answer_relevancy" and self.embeddings is None: raise ValueError("answer_relevancy metric need provide embeddings model.") - tmp_metrics.append(get_metric(metric)) + tmp_metrics.append(self.get_metric(metric)) self.metrics = tmp_metrics else: # default metrics self.metrics = [ @@ -125,6 +104,32 @@ def __init__( context_recall, ] + def get_metric(name: str): + if name == "answer_relevancy": + from ragas.metrics import answer_relevancy + return answer_relevancy + elif name == "faithfulness": + from ragas.metrics import faithfulness + return faithfulness + elif name == "answer_correctness": + from ragas.metrics import answer_correctness + return answer_correctness + elif name == "answer_similarity": + from ragas.metrics import answer_similarity + return answer_similarity + elif name == "context_precision": + from ragas.metrics import context_precision + return context_precision + elif name == "context_relevancy": + from ragas.metrics import context_relevancy + return context_relevancy + elif name == "context_recall": + from ragas.metrics import context_recall + return context_recall + else: + raise ValueError(f"The {name} metric has not been validated.") + + async def a_measure(self, test_case: Dict): return self.measure(test_case) From e4cd9c6977e2a679b86a44e6b32086ae2b155e98 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Sep 2024 17:22:37 +0000 Subject: [PATCH 16/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- evals/metrics/ragas/ragas.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/evals/metrics/ragas/ragas.py b/evals/metrics/ragas/ragas.py index 93ee0bde..35449c08 100644 --- a/evals/metrics/ragas/ragas.py +++ b/evals/metrics/ragas/ragas.py @@ -16,10 +16,6 @@ def format_ragas_metric_name(name: str): return f"{name} (ragas)" - - - - class RagasMetric: """This metric checks if the output is more than 3 letters.""" @@ -64,7 +60,7 @@ def __init__( from datasets import Dataset except ModuleNotFoundError: raise ModuleNotFoundError("Please install dataset") - + self.metrics_instance = { "answer_correctness": answer_correctness, "answer_relevancy": answer_relevancy, @@ -110,9 +106,9 @@ def __init__( if metric == "answer_relevancy" and self.embeddings is None: raise ValueError("answer_relevancy metric need provide embeddings model.") tmp_metrics.append(self.metrics_instance[metric]) - + self.metrics = tmp_metrics - + else: # default metrics self.metrics = [ answer_relevancy, @@ -122,7 +118,6 @@ def __init__( context_precision, context_recall, ] - async def a_measure(self, test_case: Dict): return self.measure(test_case) From 69f87c220bc50041c75cd3967c9d7b17a6b1a4d7 Mon Sep 17 00:00:00 2001 From: minmin-intel Date: Mon, 9 Sep 2024 17:44:03 +0000 Subject: [PATCH 17/18] update test case col names in grade_answer.py Signed-off-by: minmin-intel --- .../crag_eval/run_benchmark/grade_answers.py | 22 +++++++------------ tests/test_ragas.py | 9 ++++---- 2 files changed, 13 
insertions(+), 18 deletions(-) diff --git a/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py b/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py index 094a87aa..8f95d497 100644 --- a/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py +++ b/evals/evaluation/agent_eval/crag_eval/run_benchmark/grade_answers.py @@ -11,30 +11,24 @@ def convert_data_format_for_ragas(data): - # data: pandas dataframe - # columns: ['query', 'answer', 'ref_answer'] - # return: a dict with keys: 'input', 'actual_output', 'expected_output' output = { - "input": data["query"].tolist(), - "actual_output": data["answer"].tolist(), - "expected_output": data["ref_answer"].tolist(), - "retrieval_context": [["dummy_context"] for _ in range(data["query"].shape[0])], + "question": data["query"].tolist(), + "answer": data["answer"].tolist(), + "ground_truth": data["ref_answer"].tolist(), + "contexts": [["dummy_context"] for _ in range(data["query"].shape[0])], } return output def make_list_of_test_cases(data): - # data: pandas dataframe - # columns: ['query', 'answer', 'ref_answer'] - # return: a list of dicts with keys: 'input', 'actual_output', 'expected_output' output = [] for _, row in data.iterrows(): output.append( { - "input": [row["query"]], - "actual_output": [row["answer"]], - "expected_output": [row["ref_answer"]], - "retrieval_context": [["dummy_context"]], + "question": [row["query"]], + "answer": [row["answer"]], + "ground_truth": [row["ref_answer"]], + "contexts": [["dummy_context"]], } ) return output diff --git a/tests/test_ragas.py b/tests/test_ragas.py index e11835ad..eab71800 100644 --- a/tests/test_ragas.py +++ b/tests/test_ragas.py @@ -5,13 +5,14 @@ import unittest - +import os from evals.metrics.ragas import RagasMetric - +host_ip = os.getenv("host_ip", "localhost") +port = os.getenv("port", "8008") class TestRagasMetric(unittest.TestCase): - @unittest.skip("need pass localhost id") + # @unittest.skip("need pass localhost id") def test_ragas(self): # Replace this with the actual output from your LLM application actual_output = "We offer a 30-day full refund at no extra cost." @@ -24,7 +25,7 @@ def test_ragas(self): from langchain_community.embeddings import HuggingFaceBgeEmbeddings embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5") - metric = RagasMetric(threshold=0.5, model="http://localhost:8008", embeddings=embeddings) + metric = RagasMetric(threshold=0.5, model=f"http://{host_ip}:{port}", embeddings=embeddings) test_case = { "question": ["What if these shoes don't fit?"], "answer": [actual_output], From 82894aa48ca6d97a0e61db5a97765e08ff2504a7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Sep 2024 17:39:12 +0000 Subject: [PATCH 18/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_ragas.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_ragas.py b/tests/test_ragas.py index eab71800..3376b0b5 100644 --- a/tests/test_ragas.py +++ b/tests/test_ragas.py @@ -4,12 +4,15 @@ # SPDX-License-Identifier: Apache-2.0 -import unittest import os +import unittest + from evals.metrics.ragas import RagasMetric host_ip = os.getenv("host_ip", "localhost") port = os.getenv("port", "8008") + + class TestRagasMetric(unittest.TestCase): # @unittest.skip("need pass localhost id")
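With the column renaming in place, both the CRAG grading script and the unit test feed ragas-native keys (`question`, `answer`, `ground_truth`, `contexts`). A minimal sketch of how the updated `RagasMetric` is exercised against a TGI judge endpoint, mirroring the test above; the endpoint address comes from the `host_ip`/`port` environment variables and the test-case strings are illustrative only:

```python
import os

from langchain_community.embeddings import HuggingFaceBgeEmbeddings

from evals.metrics.ragas import RagasMetric

host_ip = os.getenv("host_ip", "localhost")
port = os.getenv("port", "8008")

# Embedding model passed to the ragas metrics (same default as the grading script).
embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5")
metric = RagasMetric(threshold=0.5, model=f"http://{host_ip}:{port}", embeddings=embeddings)

# ragas-native column names introduced by this patch series; values are illustrative.
test_case = {
    "question": ["What if these shoes don't fit?"],
    "answer": ["We offer a 30-day full refund at no extra cost."],
    "ground_truth": ["You are eligible for a 30 day full refund at no extra cost."],
    "contexts": [["All customers are eligible for a 30 day full refund at no extra cost."]],
}

metric.measure(test_case)
print(metric.score)  # dict of per-metric scores, e.g. {"answer_correctness": ...}
```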