From cf8bd83f14118530097dc7b5a65030f94b57e841 Mon Sep 17 00:00:00 2001
From: Sihan Chen <39623753+Spycsh@users.noreply.github.com>
Date: Mon, 9 Sep 2024 16:28:16 +0800
Subject: [PATCH] add audioqna asr wer eval scripts (#117)

* add wer eval scripts

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 examples/AudioQnA/README.md        | 48 +++++++++++++++++++++++++
 examples/AudioQnA/local_eval.py    | 35 +++++++++++++++++++
 examples/AudioQnA/online_eval.py   | 56 ++++++++++++++++++++++++++++++
 examples/AudioQnA/requirements.txt |  8 +++++
 4 files changed, 147 insertions(+)
 create mode 100644 examples/AudioQnA/README.md
 create mode 100644 examples/AudioQnA/local_eval.py
 create mode 100644 examples/AudioQnA/online_eval.py
 create mode 100644 examples/AudioQnA/requirements.txt

diff --git a/examples/AudioQnA/README.md b/examples/AudioQnA/README.md
new file mode 100644
index 00000000..918a7997
--- /dev/null
+++ b/examples/AudioQnA/README.md
@@ -0,0 +1,48 @@
+# AudioQnA Accuracy Evaluation
+
+## Dataset
+
+We evaluate ASR accuracy on the test set of the LibriSpeech [dataset](https://huggingface.co/datasets/andreagasparini/librispeech_test_only), which contains 2620 audio recordings paired with reference transcripts.
+
+## Metrics
+
+We evaluate the WER (Word Error Rate) of the ASR microservice.
+
+## Evaluation
+
+### Launch ASR microservice
+
+Launch the ASR microservice with the following commands. For more details, please refer to the [doc](https://github.com/opea-project/GenAIComps/tree/main/comps/asr).
+
+```bash
+git clone https://github.com/opea-project/GenAIComps
+cd GenAIComps
+docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/Dockerfile .
+
+# change the model to evaluate by editing --model_name_or_path
+docker run -p 7066:7066 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/whisper:latest --model_name_or_path "openai/whisper-tiny"
+```
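+
+Once the container is up, you can optionally spot-check the service on a single clip before running the full evaluation. The snippet below is a minimal sketch: it assumes the request/response schema used by `online_eval.py` (a JSON body with a base64-encoded `audio` field, answered with an `asr_result` field) and a local WAV file named `sample.wav`, which is a placeholder name.
+
+```py
+import base64
+import json
+
+import requests
+
+# read a WAV clip and base64-encode it, matching the payload online_eval.py sends
+with open("sample.wav", "rb") as f:
+    audio_b64 = base64.b64encode(f.read()).decode("utf-8")
+
+response = requests.post("http://localhost:7066/v1/asr", data=json.dumps({"audio": audio_b64}))
+print(response.json()["asr_result"])
+```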
+
+### Evaluate
+
+Install dependencies:
+
+```
+pip install -r requirements.txt
+```
+
+Evaluate the ASR accuracy:
+
+```bash
+# validate the local (offline) model
+# python local_eval.py
+# validate the online ASR microservice accuracy
+python online_eval.py
+```
+
+### Performance Result
+
+Here are the tested results for your reference:
+
+| Model            | WER  |
+| ---------------- | ---- |
+| whisper-large-v2 | 2.87 |
+| whisper-large    | 2.7  |
+| whisper-medium   | 3.45 |
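+
+For intuition about these numbers: WER is reported here as a percentage, i.e. the count of substituted, inserted, and deleted words divided by the number of reference words, times 100. A quick sanity check of the metric itself, as a minimal sketch using the same `evaluate` package the scripts rely on:
+
+```py
+from evaluate import load
+
+wer = load("wer")
+# one substituted word ("word" vs "world") out of five reference words -> 20.0
+print(100 * wer.compute(references=["hello world how are you"], predictions=["hello word how are you"]))
+```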
base64.b64encode(f.read()).decode("utf-8") + + inputs = {"audio": test_audio_base64_str} + endpoint = "http://localhost:7066/v1/asr" + response = requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None}) + + result_str = response.json()["asr_result"] + + batch["prediction"] = processor.tokenizer._normalize(result_str) + return batch + + +result = librispeech_test_clean.map(map_to_pred) + +wer = load("wer") +print(100 * wer.compute(references=result["reference"], predictions=result["prediction"])) diff --git a/examples/AudioQnA/requirements.txt b/examples/AudioQnA/requirements.txt new file mode 100644 index 00000000..c3f6c51a --- /dev/null +++ b/examples/AudioQnA/requirements.txt @@ -0,0 +1,8 @@ +datasets +evaluate +jiwer +librosa +pydub +soundfile +torch +transformers