-
Notifications
You must be signed in to change notification settings - Fork 41
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add audioqna asr wer eval scripts (#117)
* add wer eval scripts * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
- Loading branch information
1 parent
cff0a36
commit cf8bd83
Showing
4 changed files
with
147 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
# AudioQnA accuracy Evaluation | ||
|
||
## Dataset | ||
|
||
|
||
We evaluate the ASR accuracy on the test set of the LibriSpeech [dataset](https://huggingface.co/datasets/andreagasparini/librispeech_test_only), which contains 2620 audio recordings with their transcripts. | ||
|
||
## Metrics | ||
|
||
We evaluate the WER (Word Error Rate) metric of the ASR microservice. | ||
|
||
## Evaluation | ||
|
||
### Launch ASR microservice | ||
|
||
Launch the ASR microservice with the following commands. For more details, please refer to the [doc](https://github.com/opea-project/GenAIComps/tree/main/comps/asr). | ||
|
||
```bash | ||
git clone https://github.com/opea-project/GenAIComps | ||
cd GenAIComps | ||
docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/Dockerfile . | ||
# to evaluate a different model, change the value passed to --model_name_or_path | ||
docker run -p 7066:7066 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/whisper:latest --model_name_or_path "openai/whisper-tiny" | ||
``` | ||
|
||
### Evaluate | ||
|
||
Install dependencies: | ||
|
||
``` | ||
pip install -r requirements.txt | ||
``` | ||
|
||
Evaluate the accuracy of the ASR microservice: | ||
```bash | ||
# validate the offline model | ||
# python offline_evaluate.py | ||
# validate the online asr microservice accuracy | ||
python online_evaluate.py | ||
``` | ||
|
||
### Performance Result | ||
Here are the tested results for your reference: | ||
| Model | WER | ||
| --- | ---- | | ||
|whisper-large-v2| 2.87| | ||
|whisper-large| 2.7 | | ||
|whisper-medium| 3.45 | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
import torch | ||
from datasets import load_dataset | ||
from evaluate import load | ||
from transformers import WhisperForConditionalGeneration, WhisperProcessor | ||
|
||
# Run on the GPU when torch reports one as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Whisper checkpoint evaluated by this script.
MODEL_NAME = "openai/whisper-large-v2"

# "clean" configuration, "test" split of the LibriSpeech test-only dataset.
librispeech_test_clean = load_dataset(
    "andreagasparini/librispeech_test_only",
    "clean",
    split="test",
    trust_remote_code=True,
)

# Feature extractor + tokenizer, and the model itself placed on `device`.
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
model = model.to(device)
|
||
|
||
def map_to_pred(batch):
    """Transcribe one dataset record locally and attach normalized texts.

    Reads ``batch["audio"]`` (its ``array`` and ``sampling_rate`` entries)
    and ``batch["text"]``. Adds ``batch["reference"]`` and
    ``batch["prediction"]``, both passed through the tokenizer's
    normalizer so the two strings are compared in the same form.

    Returns the same ``batch`` mapping with the two keys added.
    """
    normalize = processor.tokenizer._normalize
    sample = batch["audio"]

    features = processor(
        sample["array"],
        sampling_rate=sample["sampling_rate"],
        return_tensors="pt",
    ).input_features
    batch["reference"] = normalize(batch["text"])

    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated = model.generate(features.to(device))
        # Only one clip was passed in, so take the first generated sequence.
        first_sequence = generated[0]
    decoded_text = processor.decode(first_sequence)

    batch["prediction"] = normalize(decoded_text)
    return batch
|
||
|
||
# Apply map_to_pred to every record of the test split.
result = librispeech_test_clean.map(map_to_pred)

# Score the normalized predictions against the normalized references and
# print the word error rate scaled by 100.
wer = load("wer")
wer_score = wer.compute(
    predictions=result["prediction"],
    references=result["reference"],
)
print(100 * wer_score)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
import base64 | ||
import json | ||
|
||
import requests | ||
import torch | ||
from datasets import load_dataset | ||
from evaluate import load | ||
from pydub import AudioSegment | ||
from transformers import WhisperForConditionalGeneration, WhisperProcessor | ||
|
||
# In this script the processor is used only for its tokenizer's text
# normalizer; the transcription itself comes from the ASR service.
MODEL_NAME = "openai/whisper-large-v2"
processor = WhisperProcessor.from_pretrained(MODEL_NAME)

# "clean" configuration, "test" split of the LibriSpeech test-only dataset.
librispeech_test_clean = load_dataset(
    "andreagasparini/librispeech_test_only",
    "clean",
    split="test",
    trust_remote_code=True,
)
|
||
|
||
def _librispeech_audio_path(file_path):
    """Map a dataset ``file`` entry to its path inside the archive layout.

    ``<dir>/<a>-<b>-<c>.<ext>`` becomes
    ``<dir>/LibriSpeech/test-clean/<a>/<b>/<a>-<b>-<c>.<ext>``.

    Raises:
        ValueError: if the file name does not contain at least two
            ``-``-separated fields.
    """
    import os

    # Split on "/" explicitly (as the dataset paths do), and take the
    # extension from the file name only so a dot in a directory name cannot
    # be mistaken for the extension separator.
    directory, _, filename = file_path.rpartition("/")
    stem, extension = os.path.splitext(filename)

    fields = stem.split("-")
    if len(fields) < 2:
        raise ValueError(f"unexpected LibriSpeech file name: {file_path!r}")

    prefix = f"{directory}/" if "/" in file_path else ""
    return f"{prefix}LibriSpeech/test-clean/{fields[0]}/{fields[1]}/{stem}{extension}"


def map_to_pred(batch):
    """Transcribe one dataset record through the ASR service.

    Reads ``batch["text"]`` and ``batch["file"]``. The audio file is
    re-encoded as WAV, base64-encoded and posted as JSON to
    ``http://localhost:7066/v1/asr``; the ``asr_result`` field of the reply
    is taken as the transcription. Adds ``batch["reference"]`` and
    ``batch["prediction"]``, both passed through the tokenizer's normalizer.

    Returns the same ``batch`` mapping with the two keys added.

    Raises:
        requests.HTTPError: if the service answers with an error status.
    """
    import io

    normalize = processor.tokenizer._normalize
    batch["reference"] = normalize(batch["text"])

    audio_path = _librispeech_audio_path(batch["file"])

    # Encode in memory and state the format explicitly: pydub's export()
    # defaults to mp3, which would send lossy audio, and a shared scratch
    # file on disk would be left behind and is unsafe under parallel maps.
    wav_buffer = io.BytesIO()
    AudioSegment.from_file(audio_path).export(wav_buffer, format="wav")
    test_audio_base64_str = base64.b64encode(wav_buffer.getvalue()).decode("utf-8")

    inputs = {"audio": test_audio_base64_str}
    endpoint = "http://localhost:7066/v1/asr"
    # Bypass any configured HTTP proxy for the local service.
    response = requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None})
    # Fail with the HTTP error instead of a confusing JSON/KeyError later.
    response.raise_for_status()

    result_str = response.json()["asr_result"]

    batch["prediction"] = normalize(result_str)
    return batch
|
||
|
||
# Send every record of the test split through the ASR service.
result = librispeech_test_clean.map(map_to_pred)

# Score the normalized predictions against the normalized references and
# print the word error rate scaled by 100.
wer = load("wer")
wer_score = wer.compute(
    predictions=result["prediction"],
    references=result["reference"],
)
print(100 * wer_score)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
datasets | ||
evaluate | ||
jiwer | ||
librosa | ||
pydub | ||
soundfile | ||
torch | ||
transformers |