diff --git a/examples/AudioQnA/README.md b/examples/AudioQnA/README.md
deleted file mode 100644
index 918a7997..00000000
--- a/examples/AudioQnA/README.md
+++ /dev/null
@@ -1,48 +0,0 @@
-# AudioQnA Accuracy Evaluation
-
-## Dataset
-
-
-We evaluate ASR accuracy on the LibriSpeech test set ([andreagasparini/librispeech_test_only](https://huggingface.co/datasets/andreagasparini/librispeech_test_only)), which contains 2620 audio-text records.
-
-## Metrics
-
-We evaluate the WER (Word Error Rate) metric of the ASR microservice.
-
-## Evaluation
-
-### Launch ASR microservice
-
-Launch the ASR microservice with the following commands. For more details, please refer to the [documentation](https://github.com/opea-project/GenAIComps/tree/main/comps/asr).
-
-```bash
-git clone https://github.com/opea-project/GenAIComps
-cd GenAIComps
-docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/Dockerfile .
-# change --model_name_or_path to the model you want to evaluate
-docker run -p 7066:7066 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/whisper:latest --model_name_or_path "openai/whisper-tiny"
-```
-
-### Evaluate
-
-Install dependencies:
-
-```
-pip install -r requirements.txt
-```
-
-Evaluate the ASR accuracy, either locally or against the running microservice:
-```py
-# validate the local (offline) model
-# python local_eval.py
-# validate the online ASR microservice accuracy
-python online_eval.py
-```
-
-### Performance Result
-Here are the tested results for your reference:
-| Model | WER (%) |
-| --- | ---- |
-|whisper-large-v2| 2.87|
-|whisper-large| 2.7 |
-|whisper-medium| 3.45 |
diff --git a/examples/AudioQnA/local_eval.py b/examples/AudioQnA/local_eval.py
deleted file mode 100644
index 1ef7b6df..00000000
--- a/examples/AudioQnA/local_eval.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-import torch
-from datasets import load_dataset
-from evaluate import load
-from transformers import WhisperForConditionalGeneration, WhisperProcessor
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-MODEL_NAME = "openai/whisper-large-v2"
-
-librispeech_test_clean = load_dataset(
-    "andreagasparini/librispeech_test_only", "clean", split="test", trust_remote_code=True
-)
-processor = WhisperProcessor.from_pretrained(MODEL_NAME)
-model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
-
-
-def map_to_pred(batch):
-    audio = batch["audio"]
-    input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features
-    batch["reference"] = processor.tokenizer._normalize(batch["text"])
-
-    with torch.no_grad():
-        predicted_ids = model.generate(input_features.to(device))[0]
-    transcription = processor.decode(predicted_ids)
-    batch["prediction"] = processor.tokenizer._normalize(transcription)
-    return batch
-
-
-result = librispeech_test_clean.map(map_to_pred)
-
-wer = load("wer")
-print(100 * wer.compute(references=result["reference"], predictions=result["prediction"]))
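For reference, the WER metric used by both evaluation scripts can be sanity-checked in isolation. The following is a minimal sketch using the same `evaluate`/`jiwer` packages listed in requirements.txt; the reference and prediction strings are made-up examples, not dataset records.

```py
# Minimal sketch: sanity-check the WER metric on hand-written strings.
# Assumes `evaluate` and `jiwer` are installed (see requirements.txt).
from evaluate import load

wer = load("wer")
references = ["the quick brown fox jumps over the lazy dog"]
predictions = ["the quick brown fox jumped over a lazy dog"]
# Two of the nine reference words differ ("jumps" -> "jumped", "the" -> "a"),
# so the expected WER is 2/9 ~= 0.222, i.e. 22.2 when multiplied by 100.
print(100 * wer.compute(references=references, predictions=predictions))
```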
diff --git a/examples/AudioQnA/online_eval.py b/examples/AudioQnA/online_eval.py
deleted file mode 100644
index a7854c95..00000000
--- a/examples/AudioQnA/online_eval.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-import base64
-import json
-
-import requests
-import torch
-from datasets import load_dataset
-from evaluate import load
-from pydub import AudioSegment
-from transformers import WhisperForConditionalGeneration, WhisperProcessor
-
-MODEL_NAME = "openai/whisper-large-v2"
-processor = WhisperProcessor.from_pretrained(MODEL_NAME)
-
-librispeech_test_clean = load_dataset(
-    "andreagasparini/librispeech_test_only", "clean", split="test", trust_remote_code=True
-)
-
-
-def map_to_pred(batch):
-    batch["reference"] = processor.tokenizer._normalize(batch["text"])
-
-    file_path = batch["file"]
-    # rewrite the file path to the local LibriSpeech test-clean layout
-    pidx = file_path.rfind("/")
-    sidx = file_path.rfind(".")
-
-    file_path_prefix = file_path[: pidx + 1]
-    file_path_suffix = file_path[sidx:]
-    file_path_mid = file_path[pidx + 1 : sidx]
-    splits = file_path_mid.split("-")
-    file_path_mid = f"LibriSpeech/test-clean/{splits[0]}/{splits[1]}/{file_path_mid}"
-
-    file_path = file_path_prefix + file_path_mid + file_path_suffix
-
-    audio = AudioSegment.from_file(file_path)
-    audio.export("tmp.wav", format="wav")
-    with open("tmp.wav", "rb") as f:
-        test_audio_base64_str = base64.b64encode(f.read()).decode("utf-8")
-
-    inputs = {"audio": test_audio_base64_str}
-    endpoint = "http://localhost:7066/v1/asr"
-    response = requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None})
-
-    result_str = response.json()["asr_result"]
-
-    batch["prediction"] = processor.tokenizer._normalize(result_str)
-    return batch
-
-
-result = librispeech_test_clean.map(map_to_pred)
-
-wer = load("wer")
-print(100 * wer.compute(references=result["reference"], predictions=result["prediction"]))
diff --git a/examples/AudioQnA/requirements.txt b/examples/AudioQnA/requirements.txt
deleted file mode 100644
index c3f6c51a..00000000
--- a/examples/AudioQnA/requirements.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-datasets
-evaluate
-jiwer
-librosa
-pydub
-soundfile
-torch
-transformers
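The request/response contract that online_eval.py relies on can also be exercised with a short standalone script. The sketch below assumes the opea/whisper container from the AudioQnA README is listening on localhost:7066 and that `sample.wav` is any local audio file you supply; the payload and the `asr_result` field mirror the evaluation script above.

```py
# Minimal sketch: send one audio file to the ASR microservice and print the transcript.
# Assumes the opea/whisper container is running on localhost:7066 (see the README above)
# and that "sample.wav" is a local audio file you provide.
import base64
import json

import requests

with open("sample.wav", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode("utf-8")

response = requests.post(
    "http://localhost:7066/v1/asr",
    data=json.dumps({"audio": audio_b64}),
    headers={"Content-Type": "application/json"},
    proxies={"http": None},  # bypass any configured HTTP proxy for localhost
)
response.raise_for_status()
print(response.json()["asr_result"])
```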
diff --git a/examples/CodeGen/README.md b/examples/CodeGen/README.md
deleted file mode 100644
index 5d118967..00000000
--- a/examples/CodeGen/README.md
+++ /dev/null
@@ -1,92 +0,0 @@
-# CodeGen Accuracy Evaluation
-
-## Evaluation Framework
-We evaluate accuracy with [bigcode-evaluation-harness](https://github.com/bigcode-project/bigcode-evaluation-harness), a framework for evaluating code generation models.
-
-
-## Evaluation FAQs
-
-### Launch CodeGen microservice
-Please refer to the [CodeGen Examples](https://github.com/opea-project/GenAIExamples/tree/main/CodeGen) and follow the guide to deploy the CodeGen megaservice.
-
-Use the following cURL command to test the CodeGen service and ensure that it has started properly:
-```bash
-export CODEGEN_ENDPOINT="http://${your_ip}:7778/v1/codegen"
-curl $CODEGEN_ENDPOINT \
-  -H "Content-Type: application/json" \
-  -d '{"messages": "Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception."}'
-
-```
-
-
-### Generation and Evaluation
-
-For evaluating models on coding tasks, and coding LLMs in particular, we follow [bigcode-evaluation-harness](https://github.com/bigcode-project/bigcode-evaluation-harness) and provide both command-line and function-call usage. [HumanEval](https://huggingface.co/datasets/openai_humaneval), [HumanEval+](https://huggingface.co/datasets/evalplus/humanevalplus), [InstructHumanEval](https://huggingface.co/datasets/codeparrot/instructhumaneval), [APPS](https://huggingface.co/datasets/codeparrot/apps), [MBPP](https://huggingface.co/datasets/mbpp), [MBPP+](https://huggingface.co/datasets/evalplus/mbppplus), and [DS-1000](https://github.com/HKUNLP/DS-1000/) are available, in both completion (left-to-right) and insertion (FIM) mode.
-#### Command line usage
-
-```shell
-cd evals/evaluation/bigcode_evaluation_harness/examples
-python main.py --model Qwen/CodeQwen1.5-7B-Chat \
-    --tasks humaneval \
-    --codegen_url $CODEGEN_ENDPOINT \
-    --max_length_generation 2048 \
-    --batch_size 1 \
-    --save_generations \
-    --save_references \
-    --allow_code_execution
-```
-
-***Note:*** Currently, our framework is designed to execute tasks in full. To ensure the accuracy of results, we advise against using the `limit` or `limit_start` parameters to restrict the number of test samples.
-
-
-### Accuracy Result
-Here is the tested result for your reference:
-```json
-{
-  "humaneval": {
-    "pass@1": 0.7195121951219512
-  },
-  "config": {
-    "prefix": "",
-    "do_sample": true,
-    "temperature": 0.2,
-    "top_k": 0,
-    "top_p": 0.95,
-    "n_samples": 1,
-    "eos": "<|endoftext|>",
-    "seed": 0,
-    "model": "Qwen/CodeQwen1.5-7B-Chat",
-    "modeltype": "causal",
-    "peft_model": null,
-    "revision": null,
-    "use_auth_token": false,
-    "trust_remote_code": false,
-    "tasks": "humaneval",
-    "instruction_tokens": null,
-    "batch_size": 1,
-    "max_length_generation": 2048,
-    "precision": "fp32",
-    "load_in_8bit": false,
-    "load_in_4bit": false,
-    "left_padding": false,
-    "limit": null,
-    "limit_start": 0,
-    "save_every_k_tasks": -1,
-    "postprocess": true,
-    "allow_code_execution": true,
-    "generation_only": false,
-    "load_generations_path": null,
-    "load_data_path": null,
-    "metric_output_path": "evaluation_results.json",
-    "save_generations": true,
-    "load_generations_intermediate_paths": null,
-    "save_generations_path": "generations.json",
-    "save_references": true,
-    "save_references_path": "references.json",
-    "prompt": "prompt",
-    "max_memory_per_gpu": null,
-    "check_references": false,
-    "codegen_url": "http://192.168.123.104:31234/v1/codegen"
-  }
-}
-```
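Before launching a full harness run, the same smoke test shown with cURL above can be issued from Python. This is a minimal sketch, not part of the original example; the endpoint comes from `CODEGEN_ENDPOINT`, and the response is printed raw because its exact schema depends on the deployed megaservice.

```py
# Minimal sketch: smoke-test the CodeGen endpoint from Python before running
# the bigcode-evaluation-harness. Mirrors the cURL command above.
import os

import requests

endpoint = os.getenv("CODEGEN_ENDPOINT", "http://localhost:7778/v1/codegen")
payload = {
    "messages": "Implement a high-level API for a TODO list application. "
    "The API takes as input an operation request and updates the TODO list in place. "
    "If the request is invalid, raise an exception."
}
response = requests.post(endpoint, json=payload, timeout=300)
response.raise_for_status()
# The response schema depends on the deployed megaservice, so print it raw.
print(response.text)
```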
diff --git a/examples/FaqGen/README.md b/examples/FaqGen/README.md
deleted file mode 100644
index 70d66744..00000000
--- a/examples/FaqGen/README.md
+++ /dev/null
@@ -1,63 +0,0 @@
-# FaqGen Performance Evaluation
-
-## Dataset
-We evaluate performance on the QA dataset [Squad_v2](https://huggingface.co/datasets/rajpurkar/squad_v2). We generate FAQs from the "context" column of the validation split, which contains 1204 unique records.
-
-First, download the dataset and put it at "./data".
-
-Extract the unique "context" entries, which will be saved to 'data/sqv2_context.json':
-```
-python get_context.py
-```
-
-## Generate FAQs
-
-### Launch FaqGen microservice
-Please refer to the [FaqGen microservice](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/faq-generation/tgi) to set up a microservice endpoint.
-```
-export FAQ_ENDPOINT="http://${your_ip}:9000/v1/faqgen"
-```
-
-### Generate FAQs with microservice
-Use the microservice endpoint to generate FAQs for the dataset.
-```
-python generate_FAQ.py
-```
-
-Post-process the output to extract the generated FAQs, which will be saved to 'data/sqv2_faq.json'.
-```
-python post_process_FAQ.py
-```
-
-## Evaluate with Ragas
-
-### Launch TGI service
-We use "mistralai/Mixtral-8x7B-Instruct-v0.1" as the LLM judge to evaluate the generated FAQs. First we need to launch an LLM endpoint on Gaudi.
-```
-export HUGGING_FACE_HUB_TOKEN="your_huggingface_token"
-bash launch_tgi.sh
-```
-Get the endpoint:
-```
-export LLM_ENDPOINT="http://${ip_address}:8082"
-```
-
-Verify the service:
-```bash
-curl http://${ip_address}:8082/generate \
-  -X POST \
-  -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":128}}' \
-  -H 'Content-Type: application/json'
-```
-
-### Evaluate
-Evaluate the generated FAQs with the LLM judge:
-```
-python evaluate.py
-```
-
-### Performance Result
-Here is the tested result for your reference:
-| answer_relevancy | faithfulness | context_utilization | reference_free_rubrics_score |
-| ---- | ---- |---- |---- |
-| 0.7191 | 0.9681 | 0.8964 | 4.4125|
diff --git a/examples/FaqGen/evaluate.py b/examples/FaqGen/evaluate.py
deleted file mode 100644
index a082d093..00000000
--- a/examples/FaqGen/evaluate.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-import json
-import os
-
-from langchain_community.embeddings import HuggingFaceBgeEmbeddings
-
-from evals.metrics.ragas import RagasMetric
-
-llm_endpoint = os.getenv("LLM_ENDPOINT", "http://0.0.0.0:8082")
-
-f = open("data/sqv2_context.json", "r")
-sqv2_context = json.load(f)
-
-f = open("data/sqv2_faq.json", "r")
-sqv2_faq = json.load(f)
-
-templ = """Create a concise FAQs (frequently asked questions and answers) for following text:
-    TEXT: {text}
-    Do not use any prefix or suffix to the FAQ.
-    """
-
-number = 1204
-question = []
-answer = []
-ground_truth = ["None"] * number
-contexts = []
-for i in range(number):
-    inputs = sqv2_context[str(i)]
-    inputs_faq = templ.format_map({"text": inputs})
-    actual_output = sqv2_faq[str(i)]
-
-    question.append(inputs_faq)
-    answer.append(actual_output)
-    contexts.append([inputs_faq])
-
-embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5")
-metrics_faq = ["answer_relevancy", "faithfulness", "context_utilization", "reference_free_rubrics_score"]
-metric = RagasMetric(threshold=0.5, model=llm_endpoint, embeddings=embeddings, metrics=metrics_faq)
-
-test_case = {"question": question, "answer": answer, "ground_truth": ground_truth, "contexts": contexts}
-
-metric.measure(test_case)
-print(metric.score)
diff --git a/examples/FaqGen/generate_FAQ.py b/examples/FaqGen/generate_FAQ.py
deleted file mode 100644
index 2ed70b9e..00000000
--- a/examples/FaqGen/generate_FAQ.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-import json
-import os
-import time
-
-import requests
-
-llm_endpoint = os.getenv("FAQ_ENDPOINT", "http://0.0.0.0:9000/v1/faqgen")
-
-f = open("data/sqv2_context.json", "r")
-sqv2_context = json.load(f)
-
-start_time = time.time()
-headers = {"Content-Type": "application/json"}
-for i in range(1204):
-    start_time_tmp = time.time()
-    print(i)
-    inputs = sqv2_context[str(i)]
-    data = {"query": inputs, "max_new_tokens": 128}
-    response = requests.post(llm_endpoint, json=data, headers=headers)
-    f = open(f"data/result/sqv2_faq_{i}", "w")
-    f.write(inputs)
-    f.write(str(response.content, encoding="utf-8"))
-    f.close()
-    print(f"Cost {time.time()-start_time_tmp} seconds")
-print(f"\nFinished! Total time: {time.time()-start_time} seconds\n")
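Because generate_FAQ.py issues 1204 sequential requests, it can be worth smoke-testing a single request first. The sketch below reuses the payload shape from generate_FAQ.py against the FAQ_ENDPOINT described in the README; the sample context is invented for illustration.

```py
# Minimal sketch: send one FAQ-generation request before running the full 1204-record loop.
# Assumes the FaqGen microservice from the README above is reachable at FAQ_ENDPOINT.
import os

import requests

endpoint = os.getenv("FAQ_ENDPOINT", "http://0.0.0.0:9000/v1/faqgen")
sample_context = "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris."
data = {"query": sample_context, "max_new_tokens": 128}

response = requests.post(endpoint, json=data, headers={"Content-Type": "application/json"}, timeout=120)
response.raise_for_status()
# Print the raw body; it contains the streamed LLM output that
# post_process_FAQ.py later parses to extract the final FAQ text.
print(response.text)
```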
diff --git a/examples/FaqGen/get_context.py b/examples/FaqGen/get_context.py
deleted file mode 100644
index 8cb73a05..00000000
--- a/examples/FaqGen/get_context.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-import json
-import os
-
-import pandas as pd
-
-data_path = "./data"
-data = pd.read_parquet(os.path.join(data_path, "squad_v2/squad_v2/validation-00000-of-00001.parquet"))
-sq_context = list(data["context"].unique())
-sq_context_d = dict()
-for i in range(len(sq_context)):
-    sq_context_d[i] = sq_context[i]
-
-with open(os.path.join(data_path, "sqv2_context.json"), "w") as outfile:
-    json.dump(sq_context_d, outfile)
diff --git a/examples/FaqGen/launch_tgi.sh b/examples/FaqGen/launch_tgi.sh
deleted file mode 100644
index b3e04bbb..00000000
--- a/examples/FaqGen/launch_tgi.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-max_input_tokens=3072
-max_total_tokens=4096
-port_number=8082
-model_name="mistralai/Mixtral-8x7B-Instruct-v0.1"
-volume="./data"
-docker run -it \
-    --name="tgi_Mixtral" \
-    -p $port_number:80 \
-    -v $volume:/data \
-    --runtime=habana \
-    --restart always \
-    -e HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN \
-    -e HABANA_VISIBLE_DEVICES=all \
-    -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
-    -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true \
-    --cap-add=sys_nice \
-    --ipc=host \
-    -e HTTPS_PROXY=$https_proxy \
-    -e HTTP_PROXY=$http_proxy \
-    ghcr.io/huggingface/tgi-gaudi:2.0.1 \
-    --model-id $model_name \
-    --max-input-tokens $max_input_tokens \
-    --max-total-tokens $max_total_tokens \
-    --sharded true \
-    --num-shard 2
diff --git a/examples/FaqGen/post_process_FAQ.py b/examples/FaqGen/post_process_FAQ.py
deleted file mode 100644
index 83e6b835..00000000
--- a/examples/FaqGen/post_process_FAQ.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-import json
-
-faq_dict = {}
-fails = []
-for i in range(1204):
-    data = open(f"data/result/sqv2_faq_{i}", "r").readlines()
-    result = data[-6][6:]
-    # print(result)
-    if "LLMChain/final_output" not in result:
-        print(f"error1: fail for {i}")
-        fails.append(i)
-        continue
-    try:
-        result2 = json.loads(result)
-        result3 = result2["ops"][0]["value"]["text"]
-        faq_dict[str(i)] = result3
-    except Exception:
-        print(f"error2: fail for {i}")
-        fails.append(i)
-        continue
-with open("data/sqv2_faq.json", "w") as outfile:
-    json.dump(faq_dict, outfile)
-print("Failure index:")
-print(fails)
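As a quick sanity check before running evaluate.py, the post-processed output can be compared against the extracted contexts. This is a minimal sketch assuming the `data/sqv2_context.json` and `data/sqv2_faq.json` files produced by get_context.py and post_process_FAQ.py above.

```py
# Minimal sketch: verify that post-processing produced a FAQ for every context
# before running evaluate.py. Uses the files written by get_context.py and
# post_process_FAQ.py above.
import json

with open("data/sqv2_context.json", "r") as f:
    contexts = json.load(f)
with open("data/sqv2_faq.json", "r") as f:
    faqs = json.load(f)

missing = [k for k in contexts if k not in faqs]
print(f"{len(faqs)}/{len(contexts)} contexts have a generated FAQ")
if missing:
    print(f"Missing indices (re-run generation or post-processing for these): {missing[:10]} ...")
```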