From cf8bd83f14118530097dc7b5a65030f94b57e841 Mon Sep 17 00:00:00 2001
From: Sihan Chen <39623753+Spycsh@users.noreply.github.com>
Date: Mon, 9 Sep 2024 16:28:16 +0800
Subject: [PATCH] add audioqna asr wer eval scripts (#117)

* add wer eval scripts

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 examples/AudioQnA/README.md        | 48 +++++++++++++++++++++++++
 examples/AudioQnA/local_eval.py    | 35 +++++++++++++++++++
 examples/AudioQnA/online_eval.py   | 56 ++++++++++++++++++++++++++++++
 examples/AudioQnA/requirements.txt |  8 +++++
 4 files changed, 147 insertions(+)
 create mode 100644 examples/AudioQnA/README.md
 create mode 100644 examples/AudioQnA/local_eval.py
 create mode 100644 examples/AudioQnA/online_eval.py
 create mode 100644 examples/AudioQnA/requirements.txt

diff --git a/examples/AudioQnA/README.md b/examples/AudioQnA/README.md
new file mode 100644
index 00000000..918a7997
--- /dev/null
+++ b/examples/AudioQnA/README.md
@@ -0,0 +1,48 @@
+# AudioQnA Accuracy Evaluation
+
+## Dataset
+
+We evaluate ASR accuracy on the test set of the LibriSpeech [dataset](https://huggingface.co/datasets/andreagasparini/librispeech_test_only), which contains 2620 audio recordings paired with reference transcripts.
+
+## Metrics
+
+We evaluate the WER (Word Error Rate) of the ASR microservice.
+
+## Evaluation
+
+### Launch ASR microservice
+
+Launch the ASR microservice with the following commands. For more details, please refer to the [doc](https://github.com/opea-project/GenAIComps/tree/main/comps/asr).
+
+```bash
+git clone https://github.com/opea-project/GenAIComps
+cd GenAIComps
+docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/Dockerfile .
+
+# change the model to evaluate by editing --model_name_or_path
+docker run -p 7066:7066 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/whisper:latest --model_name_or_path "openai/whisper-tiny"
+```
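+
+Once the container is up, you can optionally spot-check the service on a single clip before running the full evaluation. The snippet below is a minimal sketch: it assumes the request/response schema used by `online_eval.py` (a JSON body with a base64-encoded `audio` field, answered with an `asr_result` field) and a local WAV file named `sample.wav`, which is a placeholder name.
+
+```py
+import base64
+import json
+
+import requests
+
+# read a WAV clip and base64-encode it, matching the payload online_eval.py sends
+with open("sample.wav", "rb") as f:
+    audio_b64 = base64.b64encode(f.read()).decode("utf-8")
+
+response = requests.post("http://localhost:7066/v1/asr", data=json.dumps({"audio": audio_b64}))
+print(response.json()["asr_result"])
+```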
+
+### Evaluate
+
+Install dependencies:
+
+```
+pip install -r requirements.txt
+```
+
+Evaluate the ASR accuracy:
+
+```bash
+# validate the local (offline) model
+# python local_eval.py
+# validate the online ASR microservice accuracy
+python online_eval.py
+```
+
+### Performance Result
+
+Here are the tested results for your reference:
+
+| Model            | WER  |
+| ---------------- | ---- |
+| whisper-large-v2 | 2.87 |
+| whisper-large    | 2.7  |
+| whisper-medium   | 3.45 |
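+
+For intuition about these numbers: WER is reported here as a percentage, i.e. the count of substituted, inserted, and deleted words divided by the number of reference words, times 100. A quick sanity check of the metric itself, as a minimal sketch using the same `evaluate` package the scripts rely on:
+
+```py
+from evaluate import load
+
+wer = load("wer")
+# one substituted word ("word" vs "world") out of five reference words -> 20.0
+print(100 * wer.compute(references=["hello world how are you"], predictions=["hello word how are you"]))
+```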
base64.b64encode(f.read()).decode("utf-8") + + inputs = {"audio": test_audio_base64_str} + endpoint = "http://localhost:7066/v1/asr" + response = requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None}) + + result_str = response.json()["asr_result"] + + batch["prediction"] = processor.tokenizer._normalize(result_str) + return batch + + +result = librispeech_test_clean.map(map_to_pred) + +wer = load("wer") +print(100 * wer.compute(references=result["reference"], predictions=result["prediction"])) diff --git a/examples/AudioQnA/requirements.txt b/examples/AudioQnA/requirements.txt new file mode 100644 index 00000000..c3f6c51a --- /dev/null +++ b/examples/AudioQnA/requirements.txt @@ -0,0 +1,8 @@ +datasets +evaluate +jiwer +librosa +pydub +soundfile +torch +transformers