From cef6eacf22bbd5ab8339479bfc1c749c63d23710 Mon Sep 17 00:00:00 2001 From: Sihan Chen <39623753+Spycsh@users.noreply.github.com> Date: Fri, 21 Jun 2024 11:21:07 +0800 Subject: [PATCH] Add asr/tts components for xeon and hpu (#222) * add asr/tts component for xeon and hpu Signed-off-by: Spycsh * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * fix * fix ffmpeg JSONDecode error on HPU * add tests * trigger * try --------- Signed-off-by: Spycsh Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- comps/asr/Dockerfile | 4 - comps/asr/README.md | 78 +++++++++++++-- comps/asr/asr.py | 106 +++++--------------- comps/asr/check_asr_server.py | 31 ++++++ comps/asr/requirements.txt | 3 +- comps/asr/whisper/Dockerfile | 23 +++++ comps/asr/whisper/Dockerfile_hpu | 26 +++++ comps/asr/whisper/__init__.py | 2 + comps/asr/whisper/check_whisper_server.py | 31 ++++++ comps/asr/whisper/whisper_model.py | 115 ++++++++++++++++++++++ comps/asr/whisper/whisper_server.py | 64 ++++++++++++ comps/lvms/README.md | 16 +-- comps/tts/Dockerfile | 10 -- comps/tts/README.md | 76 +++++++++++--- comps/tts/requirements.txt | 2 + comps/tts/speecht5/Dockerfile | 24 +++++ comps/tts/speecht5/Dockerfile_hpu | 28 ++++++ comps/tts/speecht5/__init__.py | 2 + comps/tts/speecht5/speecht5_model.py | 103 +++++++++++++++++++ comps/tts/speecht5/speecht5_server.py | 50 ++++++++++ comps/tts/tts.py | 114 ++++----------------- tests/test_asr_whisper.sh | 54 ++++++++++ tests/test_tts_speecht5.sh | 54 ++++++++++ 23 files changed, 792 insertions(+), 224 deletions(-) create mode 100644 comps/asr/check_asr_server.py create mode 100644 comps/asr/whisper/Dockerfile create mode 100644 comps/asr/whisper/Dockerfile_hpu create mode 100644 comps/asr/whisper/__init__.py create mode 100644 comps/asr/whisper/check_whisper_server.py create mode 100644 comps/asr/whisper/whisper_model.py create mode 100644 comps/asr/whisper/whisper_server.py create mode 100644 comps/tts/speecht5/Dockerfile create mode 100644 comps/tts/speecht5/Dockerfile_hpu create mode 100644 comps/tts/speecht5/__init__.py create mode 100644 comps/tts/speecht5/speecht5_model.py create mode 100644 comps/tts/speecht5/speecht5_server.py create mode 100644 tests/test_asr_whisper.sh create mode 100644 tests/test_tts_speecht5.sh diff --git a/comps/asr/Dockerfile b/comps/asr/Dockerfile index 8b11e6389..fd1db5806 100644 --- a/comps/asr/Dockerfile +++ b/comps/asr/Dockerfile @@ -5,10 +5,6 @@ FROM python:3.11-slim ENV LANG C.UTF-8 -# Install system dependencies -RUN apt-get update \ - && apt-get install -y ffmpeg - COPY comps /home/comps RUN pip install --no-cache-dir --upgrade pip && \ diff --git a/comps/asr/README.md b/comps/asr/README.md index 295683c79..f5a197be1 100644 --- a/comps/asr/README.md +++ b/comps/asr/README.md @@ -12,35 +12,93 @@ To start the ASR microservice with Python, you need to first install python pack pip install -r requirements.txt ``` -## 1.2 Start ASR Service with Python Script +## 1.2 Start Whisper Service/Test + +- Xeon CPU + +```bash +cd whisper/ +nohup python whisper_server.py --device=cpu & +python check_whisper_server.py +``` + +- Gaudi2 HPU + +```bash +pip install optimum[habana] + +cd whisper/ +nohup python whisper_server.py --device=hpu & +python check_whisper_server.py +``` + +## 1.3 Start ASR Service/Test ```bash python asr.py +python check_asr_server.py ``` # 🚀2. 
Start Microservice with Docker (Option 2) Alternatively, you can also start the ASR microservice with Docker. -## 2.1 Build Docker Image +## 2.1 Build Images + +### 2.1.1 Whisper Server Image + +- Xeon CPU + +```bash +cd ../.. +docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/Dockerfile . +``` + +- Gaudi2 HPU + +```bash +cd ../.. +docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/Dockerfile_hpu . +``` + +### 2.1.2 ASR Service Image ```bash -cd ../../ docker build -t opea/asr:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/Dockerfile . ``` -## 2.2 Run Docker with CLI +## 2.2 Start Whisper and ASR Service + +### 2.2.1 Start Whisper Server + +- Xeon + +```bash +docker run -p 7066:7066 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/whisper:latest +``` + +- Gaudi2 HPU ```bash -docker run -p 9099:9099 --network=host --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/asr:latest +docker run -p 7066:7066 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/whisper:latest ``` -# 🚀3. Consume ASR Service +### 2.2.2 Start ASR service + +```bash +ip_address=$(hostname -I | awk '{print $1}') + +docker run -d -p 9099:9099 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e ASR_ENDPOINT=http://$ip_address:7066 opea/asr:latest +``` -You can use the following `curl` command to test whether the service is up. Notice that the first request can be slow because it needs to download the models. 
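Model weights are downloaded on the Whisper server's first start (the CI test later in this patch waits a full `sleep 3m`), so before running the test it can help to poll the server's `/v1/health` endpoint, which is defined in `whisper_server.py` below. A minimal polling sketch, assuming the default port 7066 used in the run commands above:

```python
# Minimal readiness check against the Whisper server's /v1/health endpoint
# (endpoint defined in whisper_server.py; host/port follow the README defaults).
import time

import requests


def wait_for_whisper(url="http://localhost:7066/v1/health", timeout_s=300):
    """Poll the health endpoint until it returns HTTP 200 or the timeout expires."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            if requests.get(url, timeout=5).status_code == 200:
                return True
        except requests.RequestException:
            pass  # server not up yet, keep polling
        time.sleep(5)
    return False


if __name__ == "__main__":
    print("whisper ready:", wait_for_whisper())
```

The same check works for the SpeechT5 server later in this patch by swapping in its default port 7055.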
+### 2.2.3 Test ```bash -curl http://localhost:9099/v1/audio/transcriptions \ - -H "Content-Type: application/json" \ - -d '{"url": "https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample_2.wav"}' +# Use curl or python + +# curl +http_proxy="" curl http://localhost:9099/v1/audio/transcriptions -XPOST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' -H 'Content-Type: application/json' + +# python +python check_asr_server.py ``` diff --git a/comps/asr/asr.py b/comps/asr/asr.py index 2acccdc7d..97fbb0bb0 100644 --- a/comps/asr/asr.py +++ b/comps/asr/asr.py @@ -1,78 +1,22 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import contextlib +import json import os import time import numpy as np -import torch -from datasets import Audio, Dataset -from pydub import AudioSegment -from transformers import WhisperForConditionalGeneration, WhisperProcessor - -from comps import Audio2TextDoc, ServiceType, TextDoc, opea_microservices, opea_telemetry, register_microservice - - -@opea_telemetry -def _audiosegment_to_librosawav(audiosegment): - channel_sounds = audiosegment.split_to_mono()[:1] # only select the first channel - samples = [s.get_array_of_samples() for s in channel_sounds] - - fp_arr = np.array(samples).T.astype(np.float32) - fp_arr /= np.iinfo(samples[0].typecode).max - fp_arr = fp_arr.reshape(-1) - - return fp_arr - - -@opea_telemetry -def audio2text( - audio_path, - model_name_or_path="openai/whisper-small", - language=None, - bf16=False, - device="cpu", -): - """Convert audio to text.""" - start = time.time() - model = WhisperForConditionalGeneration.from_pretrained(model_name_or_path).to(device) - processor = WhisperProcessor.from_pretrained(model_name_or_path) - model.eval() - bf16 = bf16 - if bf16: - import intel_extension_for_pytorch as ipex - - model = ipex.optimize(model, dtype=torch.bfloat16) - language = language - - try: - waveform = AudioSegment.from_file(audio_path).set_frame_rate(16000) - waveform = _audiosegment_to_librosawav(waveform) - except Exception as e: - print(f"[ASR] audiosegment to librosa wave fail: {e}") - audio_dataset = Dataset.from_dict({"audio": [audio_path]}).cast_column("audio", Audio(sampling_rate=16000)) - waveform = audio_dataset[0]["audio"]["array"] - - inputs = processor.feature_extractor(waveform, return_tensors="pt", sampling_rate=16_000).input_features.to(device) - with torch.cpu.amp.autocast() if bf16 else contextlib.nullcontext(): - if language is None: - predicted_ids = model.generate(inputs) - elif language == "auto": - model.config.forced_decoder_ids = None - predicted_ids = model.generate(inputs) - else: - forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe") - model.config.forced_decoder_ids = forced_decoder_ids - predicted_ids = model.generate(inputs) - - result = processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True, normalize=True)[0] - if language == "auto" or language == "zh": - from zhconv import convert - - result = convert(result, "zh-cn") - print(f"generated text in {time.time() - start} seconds, and the result is: {result}") - return result +import requests + +from comps import ( + Base64ByteStrDoc, + ServiceType, + TextDoc, + opea_microservices, + register_microservice, + register_statistics, + statistics_dict, +) @register_microservice( @@ -81,26 +25,22 @@ def audio2text( endpoint="/v1/audio/transcriptions", host="0.0.0.0", 
port=9099, - input_datatype=Audio2TextDoc, + input_datatype=Base64ByteStrDoc, output_datatype=TextDoc, ) -@opea_telemetry -async def audio_to_text(audio: Audio2TextDoc): - audio.tensor, audio.frame_rate = audio.url.load() # AudioNdArray, fr - audio_path = f"{audio.id}.wav" - audio.tensor.save(audio_path, frame_rate=16000) +@register_statistics(names=["opea_service@asr"]) +async def audio_to_text(audio: Base64ByteStrDoc): + start = time.time() + byte_str = audio.byte_str + inputs = {"audio": byte_str} + + response = requests.post(url=f"{asr_endpoint}/v1/asr", data=json.dumps(inputs), proxies={"http": None}) - try: - asr_result = audio2text(audio_path, model_name_or_path=audio.model_name_or_path, language=audio.language) - except Exception as e: - print(e) - asr_result = e - finally: - os.remove(audio_path) - res = TextDoc(text=asr_result) - return res + statistics_dict["opea_service@asr"].append_latency(time.time() - start, None) + return TextDoc(text=response.json()["asr_result"]) if __name__ == "__main__": + asr_endpoint = os.getenv("ASR_ENDPOINT", "http://localhost:7066") print("[asr - router] ASR initialized.") opea_microservices["opea_service@asr"].start() diff --git a/comps/asr/check_asr_server.py b/comps/asr/check_asr_server.py new file mode 100644 index 000000000..d51bc8569 --- /dev/null +++ b/comps/asr/check_asr_server.py @@ -0,0 +1,31 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import base64 +import json +import os +import urllib.request +import uuid +from io import BytesIO + +import requests + +# https://gist.github.com/novwhisky/8a1a0168b94f3b6abfaa +# test_audio_base64_str = "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA" + +uid = str(uuid.uuid4()) +file_name = uid + ".wav" + +urllib.request.urlretrieve( + "https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav", + file_name, +) + +with open(file_name, "rb") as f: + test_audio_base64_str = base64.b64encode(f.read()).decode("utf-8") +os.remove(file_name) + +endpoint = "http://localhost:9099/v1/audio/transcriptions" +inputs = {"byte_str": test_audio_base64_str} +response = requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None}) +print(response.json()) diff --git a/comps/asr/requirements.txt b/comps/asr/requirements.txt index aee4f7603..3c0c8be74 100644 --- a/comps/asr/requirements.txt +++ b/comps/asr/requirements.txt @@ -1,10 +1,11 @@ datasets docarray[full] fastapi -intel_extension_for_pytorch opentelemetry-api opentelemetry-exporter-otlp opentelemetry-sdk +optimum[habana] +pydantic==2.7.2 pydub shortuuid torch diff --git a/comps/asr/whisper/Dockerfile b/comps/asr/whisper/Dockerfile new file mode 100644 index 000000000..c3e2a0025 --- /dev/null +++ b/comps/asr/whisper/Dockerfile @@ -0,0 +1,23 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +# Set environment variables +ENV LANG=en_US.UTF-8 +ENV PYTHONPATH=/home/user + +# Install system dependencies +RUN apt-get update \ + && apt-get install -y ffmpeg + +COPY comps /home/comps + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/comps/asr/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home + +WORKDIR /home/comps/asr/whisper + +ENTRYPOINT ["python", "whisper_server.py", "--device", "cpu"] \ No newline at end of file diff --git a/comps/asr/whisper/Dockerfile_hpu b/comps/asr/whisper/Dockerfile_hpu new file mode 100644 
index 000000000..0af3b1e82 --- /dev/null +++ b/comps/asr/whisper/Dockerfile_hpu @@ -0,0 +1,26 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# HABANA environment +FROM vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1 AS hpu + +# Set environment variables +ENV LANG=en_US.UTF-8 +ENV PYTHONPATH=/home/user:/usr/lib/habanalabs/:/optimum-habana + +# Install system dependencies +RUN apt-get update \ + && apt-get install -y ffmpeg + +COPY comps /home/comps + +# Install requirements and optimum habana +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/comps/asr/requirements.txt && \ + pip install optimum[habana] + +ENV PYTHONPATH=$PYTHONPATH:/home + +WORKDIR /home/comps/asr/whisper + +ENTRYPOINT ["python", "whisper_server.py", "--device", "hpu"] \ No newline at end of file diff --git a/comps/asr/whisper/__init__.py b/comps/asr/whisper/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/asr/whisper/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/asr/whisper/check_whisper_server.py b/comps/asr/whisper/check_whisper_server.py new file mode 100644 index 000000000..1b338d08d --- /dev/null +++ b/comps/asr/whisper/check_whisper_server.py @@ -0,0 +1,31 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import base64 +import json +import os +import urllib.request +import uuid +from io import BytesIO + +import requests + +# https://gist.github.com/novwhisky/8a1a0168b94f3b6abfaa +# test_audio_base64_str = "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA" + +uid = str(uuid.uuid4()) +file_name = uid + ".wav" + +urllib.request.urlretrieve( + "https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav", + file_name, +) + +with open(file_name, "rb") as f: + test_audio_base64_str = base64.b64encode(f.read()).decode("utf-8") +os.remove(file_name) + +endpoint = "http://localhost:7066/v1/asr" +inputs = {"audio": test_audio_base64_str} +response = requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None}) +print(response.json()) diff --git a/comps/asr/whisper/whisper_model.py b/comps/asr/whisper/whisper_model.py new file mode 100644 index 000000000..0af9ebfcb --- /dev/null +++ b/comps/asr/whisper/whisper_model.py @@ -0,0 +1,115 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import contextlib +import os +import time +import urllib.request + +import numpy as np +import torch +from datasets import Audio, Dataset +from pydub import AudioSegment +from transformers import WhisperForConditionalGeneration, WhisperProcessor + + +class WhisperModel: + """Convert audio to text.""" + + def __init__(self, model_name_or_path="openai/whisper-small", language="english", device="cpu"): + if device == "hpu": + # Explicitly link HPU with Torch + from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + + adapt_transformers_to_gaudi() + + self.device = device + asr_model_name_or_path = os.environ.get("ASR_MODEL_PATH", model_name_or_path) + print("Downloading model: {}".format(asr_model_name_or_path)) + self.model = WhisperForConditionalGeneration.from_pretrained(asr_model_name_or_path).to(self.device) + self.processor = WhisperProcessor.from_pretrained(asr_model_name_or_path) + self.model.eval() + + 
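        # Default decoding language; audio2text() forwards it to model.generate(),
        # and the __main__ demo below reassigns asr.language per input clip.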
self.language = language + + if device == "hpu": + # do hpu graph warmup with a long enough input audio + # whisper has a receptive field of 30 seconds + # here we select a relatively long audio (~15 sec) to quickly warmup + self._warmup_whisper_hpu_graph("https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/labixiaoxin.wav") + + def _audiosegment_to_librosawav(self, audiosegment): + # https://github.com/jiaaro/pydub/blob/master/API.markdown#audiosegmentget_array_of_samples + # This way is faster than librosa.load or HuggingFace Dataset wrapper + channel_sounds = audiosegment.split_to_mono()[:1] # only select the first channel + samples = [s.get_array_of_samples() for s in channel_sounds] + + fp_arr = np.array(samples).T.astype(np.float32) + fp_arr /= np.iinfo(samples[0].typecode).max + fp_arr = fp_arr.reshape(-1) + + return fp_arr + + def _warmup_whisper_hpu_graph(self, url): + print("[ASR] fetch warmup audio...") + urllib.request.urlretrieve( + url, + "warmup.wav", + ) + print("[ASR] warmup...") + waveform = AudioSegment.from_file("warmup.wav").set_frame_rate(16000) + waveform = self._audiosegment_to_librosawav(waveform) + # pylint: disable=E1101 + inputs = self.processor.feature_extractor( + waveform, return_tensors="pt", sampling_rate=16_000 + ).input_features.to(self.device) + _ = self.model.generate(inputs, language="chinese") + + def audio2text(self, audio_path): + """Convert audio to text. + + audio_path: the path to the input audio, e.g. ~/xxx.mp3 + """ + start = time.time() + + try: + waveform = AudioSegment.from_file(audio_path).set_frame_rate(16000) + waveform = self._audiosegment_to_librosawav(waveform) + except Exception as e: + print(f"[ASR] audiosegment to librosa wave fail: {e}") + audio_dataset = Dataset.from_dict({"audio": [audio_path]}).cast_column("audio", Audio(sampling_rate=16000)) + waveform = audio_dataset[0]["audio"]["array"] + + # pylint: disable=E1101 + inputs = self.processor.feature_extractor( + waveform, return_tensors="pt", sampling_rate=16_000 + ).input_features.to(self.device) + predicted_ids = self.model.generate(inputs, language=self.language) + # pylint: disable=E1101 + result = self.processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True, normalize=True)[0] + if self.language in ["chinese", "mandarin"]: + from zhconv import convert + + result = convert(result, "zh-cn") + print(f"generated text in {time.time() - start} seconds, and the result is: {result}") + return result + + +if __name__ == "__main__": + asr = WhisperModel(language="english") + + # Test multilanguage asr + urllib.request.urlretrieve( + "https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/labixiaoxin.wav", + "sample.wav", + ) + asr.language = "chinese" + text = asr.audio2text("sample.wav") + + urllib.request.urlretrieve( + "https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav", + "sample.wav", + ) + text = asr.audio2text("sample.wav") + + os.remove("sample.wav") diff --git a/comps/asr/whisper/whisper_server.py b/comps/asr/whisper/whisper_server.py new file mode 100644 index 000000000..1a5c760d2 --- /dev/null +++ b/comps/asr/whisper/whisper_server.py @@ -0,0 +1,64 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import base64 +import os +import uuid + +import uvicorn +from fastapi import FastAPI, Request +from fastapi.responses import Response +from pydub import AudioSegment +from starlette.middleware.cors import 
CORSMiddleware +from whisper_model import WhisperModel + +app = FastAPI() +asr = None + +app.add_middleware( + CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"] +) + + +@app.get("/v1/health") +async def health() -> Response: + """Health check.""" + return Response(status_code=200) + + +@app.post("/v1/asr") +async def audio_to_text(request: Request): + print("Whisper generation begin.") + uid = str(uuid.uuid4()) + file_name = uid + ".wav" + request_dict = await request.json() + audio_b64_str = request_dict.pop("audio") + with open(file_name, "wb") as f: + f.write(base64.b64decode(audio_b64_str)) + + audio = AudioSegment.from_file(file_name) + audio = audio.set_frame_rate(16000) + audio.export(f"{file_name}", format="wav") + try: + asr_result = asr.audio2text(file_name) + except Exception as e: + print(e) + asr_result = e + finally: + os.remove(file_name) + return {"asr_result": asr_result} + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="0.0.0.0") + parser.add_argument("--port", type=int, default=7066) + parser.add_argument("--model_name_or_path", type=str, default="openai/whisper-small") + parser.add_argument("--language", type=str, default="english") + parser.add_argument("--device", type=str, default="cpu") + + args = parser.parse_args() + asr = WhisperModel(model_name_or_path=args.model_name_or_path, language=args.language, device=args.device) + + uvicorn.run(app, host=args.host, port=args.port) diff --git a/comps/lvms/README.md b/comps/lvms/README.md index 67854885f..8f8237180 100644 --- a/comps/lvms/README.md +++ b/comps/lvms/README.md @@ -47,11 +47,11 @@ python lvm.py python check_lvm.py ``` -# 🚀1. Start Microservice with Docker (Option 2) +# 🚀2. Start Microservice with Docker (Option 2) -## 1.2 Build Images +## 2.1 Build Images -### 1.2.1 LLaVA Server Image +### 2.1.1 LLaVA Server Image - Xeon CPU @@ -67,16 +67,16 @@ cd ../.. docker build -t opea/llava:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/llava/Dockerfile_hpu . ``` -### 1.2.2 LVM Service Image +### 2.1.2 LVM Service Image ```bash cd ../.. docker build -t opea/lvm:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/Dockerfile . 
``` -## 1.3 Start LLaVA and LVM Service +## 2.2 Start LLaVA and LVM Service -### 1.3.1 Start LLaVA server +### 2.2.1 Start LLaVA server - Xeon @@ -90,7 +90,7 @@ docker run -p 8399:8399 -e http_proxy=$http_proxy --ipc=host -e https_proxy=$htt docker run -p 8399:8399 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/llava:latest ``` -### 1.3.2 Start LVM service +### 2.2.2 Start LVM service ```bash ip_address=$(hostname -I | awk '{print $1}') @@ -98,7 +98,7 @@ ip_address=$(hostname -I | awk '{print $1}') docker run -p 9399:9399 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LVM_ENDPOINT=http://$ip_address:8399 opea/lvm:latest ``` -### 1.3.3 Test +### 2.2.3 Test ```bash # Use curl/python diff --git a/comps/tts/Dockerfile b/comps/tts/Dockerfile index 3ad17b8f8..50dd9e15f 100644 --- a/comps/tts/Dockerfile +++ b/comps/tts/Dockerfile @@ -5,21 +5,11 @@ FROM python:3.11-slim ENV LANG C.UTF-8 -# Install system dependencies -RUN apt-get update \ - && apt-get install -y ffmpeg \ - && apt-get install -y curl \ - && apt-get install -y libomp-dev google-perftools - COPY comps /home/comps RUN pip install --no-cache-dir --upgrade pip && \ pip install --no-cache-dir -r /home/comps/tts/requirements.txt -ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libiomp5.so:/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4 -ENV MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" -ENV OMP_NUM_THREADS=56 - ENV PYTHONPATH=$PYTHONPATH:/home WORKDIR /home/comps/tts diff --git a/comps/tts/README.md b/comps/tts/README.md index b0aa32456..a26e72200 100644 --- a/comps/tts/README.md +++ b/comps/tts/README.md @@ -2,45 +2,91 @@ TTS (Text-To-Speech) microservice helps users convert text to speech. When building a talking bot with LLM, users might need an LLM generated answer in audio format. This microservice is built for that conversion stage. -# 🚀1. Start Microservice with Python (Option 1) +## 1.2 Start SpeechT5 Service/Test -To start the TTS microservice, you need to first install python packages. +- Xeon CPU -## 1.1 Install Requirements +```bash +cd speecht5/ +nohup python speecht5_server.py --device=cpu & +curl http://localhost:7055/v1/tts -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json' +``` + +- Gaudi2 HPU ```bash -pip install -r requirements.txt +pip install optimum[habana] + +cd speecht5/ +nohup python speecht5_server.py --device=hpu & +curl http://localhost:7055/v1/tts -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json' ``` -## 1.2 Start TTS Service with Python Script +## 1.3 Start TTS Service/Test ```bash python tts.py + +curl http://localhost:9088/v1/audio/speech -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json' ``` # 🚀2. Start Microservice with Docker (Option 2) -Alternatively, you can start the ASR microservice with Docker. +Alternatively, you can start the TTS microservice with Docker. + +## 2.1 Build Images + +### 2.1.1 Whisper Server Image + +- Xeon CPU + +```bash +cd ../.. +docker build -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/speecht5/Dockerfile . +``` + +- Gaudi2 HPU + +```bash +cd ../.. +docker build -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/speecht5/Dockerfile_hpu . 
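# Optional sanity check (not part of the original steps): confirm the image was built
docker images | grep opea/speecht5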
+``` -## 2.1 Build Docker Image +### 2.1.2 TTS Service Image ```bash -cd ../../ docker build -t opea/tts:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/Dockerfile . ``` -## 2.2 Run Docker with CLI +## 2.2 Start SpeechT5 and TTS Service + +### 2.2.1 Start SpeechT5 Server + +- Xeon ```bash -docker run -p 9999:9999 --network=host --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/tts:latest +docker run -p 7055:7055 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/speecht5:latest ``` -# 🚀3. Consume TTS Service +- Gaudi2 HPU + +```bash +docker run -p 7055:7055 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/speecht5:latest +``` -You can use the following `curl` command to test whether the service is up. Notice that the first request can be slow because it needs to download the models. +### 2.2.2 Start TTS service ```bash -curl http://localhost:9999/v1/audio/speech \ - -H "Content-Type: application/json" \ - -d '{"text":"Hello there."}' +ip_address=$(hostname -I | awk '{print $1}') + +docker run -p 9088:9088 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TTS_ENDPOINT=http://$ip_address:7055 opea/tts:latest +``` + +### 2.2.3 Test + +```bash +# curl +curl http://localhost:7055/v1/tts -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json' + +curl http://localhost:9088/v1/audio/speech -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json' ``` diff --git a/comps/tts/requirements.txt b/comps/tts/requirements.txt index 68f2c0128..7baa60a17 100644 --- a/comps/tts/requirements.txt +++ b/comps/tts/requirements.txt @@ -1,8 +1,10 @@ +aiohttp docarray[full] fastapi opentelemetry-api opentelemetry-exporter-otlp opentelemetry-sdk +optimum[habana] sentencepiece shortuuid torch diff --git a/comps/tts/speecht5/Dockerfile b/comps/tts/speecht5/Dockerfile new file mode 100644 index 000000000..e4afd07db --- /dev/null +++ b/comps/tts/speecht5/Dockerfile @@ -0,0 +1,24 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +# Set environment variables +ENV LANG=en_US.UTF-8 +ENV PYTHONPATH=/home/user + +# Install system dependencies +RUN apt-get update \ + && apt-get install -y ffmpeg \ + && apt-get install -y curl + +COPY comps /home/comps + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/comps/tts/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home + +WORKDIR /home/comps/tts/speecht5 + +ENTRYPOINT ["python", "speecht5_server.py", "--device", "cpu"] \ No newline at end of file diff --git a/comps/tts/speecht5/Dockerfile_hpu b/comps/tts/speecht5/Dockerfile_hpu new file mode 100644 index 000000000..3b27926c3 --- /dev/null +++ b/comps/tts/speecht5/Dockerfile_hpu @@ -0,0 +1,28 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# HABANA environment +FROM vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1 AS hpu +RUN rm -rf /etc/ssh/ssh_host* + +# Set environment variables +ENV LANG=en_US.UTF-8 +ENV PYTHONPATH=/home/user:/usr/lib/habanalabs/:/optimum-habana + +# Install system dependencies +RUN apt-get update \ + && apt-get install -y ffmpeg \ + && apt-get install -y curl + +COPY comps /home/comps + +# Install requirements and optimum habana +RUN pip install --no-cache-dir --upgrade pip && \ + 
pip install --no-cache-dir -r /home/comps/tts/requirements.txt && \ + pip install optimum[habana] + +ENV PYTHONPATH=$PYTHONPATH:/home + +WORKDIR /home/comps/tts/speecht5 + +ENTRYPOINT ["python", "speecht5_server.py", "--device", "hpu"] \ No newline at end of file diff --git a/comps/tts/speecht5/__init__.py b/comps/tts/speecht5/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/tts/speecht5/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/tts/speecht5/speecht5_model.py b/comps/tts/speecht5/speecht5_model.py new file mode 100644 index 000000000..3c2bbe68a --- /dev/null +++ b/comps/tts/speecht5/speecht5_model.py @@ -0,0 +1,103 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import subprocess + +import numpy as np +import torch +from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor + + +class SpeechT5Model: + def __init__(self, device="cpu"): + self.device = device + if self.device == "hpu": + from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + + adapt_transformers_to_gaudi() + + model_name_or_path = "microsoft/speecht5_tts" + vocoder_model_name_or_path = "microsoft/speecht5_hifigan" + self.model = SpeechT5ForTextToSpeech.from_pretrained(model_name_or_path).to(device) + self.model.eval() + self.processor = SpeechT5Processor.from_pretrained(model_name_or_path, normalize=True) + self.vocoder = SpeechT5HifiGan.from_pretrained(vocoder_model_name_or_path).to(device) + self.vocoder.eval() + + # fetch default speaker embedding + if os.path.exists("spk_embed_default.pt"): + self.default_speaker_embedding = torch.load("spk_embed_default.pt") + else: + try: + p = subprocess.Popen( + [ + "curl", + "-O", + "https://raw.githubusercontent.com/intel/intel-extension-for-transformers/main/" + "intel_extension_for_transformers/neural_chat/assets/speaker_embeddings/" + "spk_embed_default.pt", + ] + ) + p.wait() + self.default_speaker_embedding = torch.load("spk_embed_default.pt") + except Exception as e: + print("Warning! Need to prepare speaker_embeddings, will use the backup embedding.") + self.default_speaker_embedding = torch.zeros((1, 512)) + + if self.device == "hpu": + # do hpu graph warmup with variable inputs + self._warmup_speecht5_hpu_graph() + + def split_long_text_into_batch(self, text, batch_length=128): + """Batch the long text into sequences of shorter sentences.""" + res = [] + hitted_ends = [",", ".", "?", "!", "。", ";", " "] + idx = 0 + cur_start = 0 + cur_end = -1 + while idx < len(text): + if idx - cur_start > batch_length: + if cur_end != -1 and cur_end > cur_start: + res.append(text[cur_start : cur_end + 1]) + else: + cur_end = cur_start + batch_length - 1 + res.append(text[cur_start : cur_end + 1]) + idx = cur_end + cur_start = cur_end + 1 + if text[idx] in hitted_ends: + cur_end = idx + idx += 1 + # deal with the last sequence + res.append(text[cur_start:idx]) + res = [i + "." for i in res] # avoid unexpected end of sequence + return res + + def _warmup_speecht5_hpu_graph(self): + self.t2s("Hello, how can I help you today?") + self.t2s("OPEA is an ecosystem orchestration framework to integrate performant GenAI technologies.") + self.t2s( + "OPEA is an ecosystem orchestration framework to integrate performant GenAI technologies & workflows leading to quicker GenAI adoption and business value." 
+ ) + + def t2s(self, text): + if self.device == "hpu": + # See https://github.com/huggingface/optimum-habana/pull/824 + from optimum.habana.utils import set_seed + + set_seed(555) + all_speech = np.array([]) + text = self.split_long_text_into_batch(text, batch_length=100) + inputs = self.processor(text=text, padding=True, max_length=128, return_tensors="pt") + with torch.no_grad(): + waveforms, waveform_lengths = self.model.generate_speech( + inputs["input_ids"].to(self.device), + speaker_embeddings=self.default_speaker_embedding.to(self.device), + attention_mask=inputs["attention_mask"].to(self.device), + vocoder=self.vocoder, + return_output_lengths=True, + ) + for i in range(waveforms.size(0)): + all_speech = np.concatenate([all_speech, waveforms[i][: waveform_lengths[i]].cpu().numpy()]) + + return all_speech diff --git a/comps/tts/speecht5/speecht5_server.py b/comps/tts/speecht5/speecht5_server.py new file mode 100644 index 000000000..467ee6b44 --- /dev/null +++ b/comps/tts/speecht5/speecht5_server.py @@ -0,0 +1,50 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import base64 +from io import BytesIO + +import uvicorn +from fastapi import FastAPI, Request +from fastapi.responses import Response +from speecht5_model import SpeechT5Model +from starlette.middleware.cors import CORSMiddleware + +app = FastAPI() +tts = None + +app.add_middleware( + CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"] +) + + +@app.get("/v1/health") +async def health() -> Response: + """Health check.""" + return Response(status_code=200) + + +@app.post("/v1/tts") +async def text_to_speech(request: Request): + print("SpeechT5 generation begin.") + request_dict = await request.json() + text = request_dict.pop("text") + + speech = tts.t2s(text) + + buffered = BytesIO() + buffered.write(speech.tobytes()) + return {"tts_result": base64.b64encode(buffered.getvalue())} + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="0.0.0.0") + parser.add_argument("--port", type=int, default=7055) + parser.add_argument("--device", type=str, default="cpu") + + args = parser.parse_args() + tts = SpeechT5Model(device=args.device) + + uvicorn.run(app, host=args.host, port=args.port) diff --git a/comps/tts/tts.py b/comps/tts/tts.py index 11891cd04..6c6bad232 100644 --- a/comps/tts/tts.py +++ b/comps/tts/tts.py @@ -1,96 +1,21 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import base64 +import json import os import time -from io import BytesIO -import numpy as np -import torch -from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor +import requests -from comps import Base64ByteStrDoc, ServiceType, TextDoc, opea_microservices, opea_telemetry, register_microservice - - -@opea_telemetry -def split_long_text_into_batch(text, batch_length=128): - """Batch the long text into sequences of shorter sentences.""" - res = [] - hitted_ends = [",", ".", "?", "!", "。", ";", " "] - idx = 0 - cur_start = 0 - cur_end = -1 - while idx < len(text): - if idx - cur_start > batch_length: - if cur_end != -1 and cur_end > cur_start: - res.append(text[cur_start : cur_end + 1]) - else: - cur_end = cur_start + batch_length - 1 - res.append(text[cur_start : cur_end + 1]) - idx = cur_end - cur_start = cur_end + 1 - if text[idx] in hitted_ends: - cur_end = idx - idx += 1 - # deal with the last sequence - 
res.append(text[cur_start:idx]) - res = [i + "." for i in res] # avoid unexpected end of sequence - return res - - -@opea_telemetry -def text2speech( - text, - model_name_or_path="microsoft/speecht5_tts", - vocoder_model_name_or_path="microsoft/speecht5_hifigan", - output_audio_path="./response.wav", - device="cpu", -): - start = time.time() - model = SpeechT5ForTextToSpeech.from_pretrained(model_name_or_path).to(device) - model.eval() - processor = SpeechT5Processor.from_pretrained(model_name_or_path, normalize=True) - vocoder = SpeechT5HifiGan.from_pretrained(vocoder_model_name_or_path).to(device) - vocoder.eval() - - if os.path.exists("spk_embed_default.pt"): - default_speaker_embedding = torch.load("spk_embed_default.pt") - else: # pragma: no cover - import subprocess - - try: - p = subprocess.Popen( - [ - "curl", - "-O", - "https://raw.githubusercontent.com/intel/intel-extension-for-transformers/main/" - "intel_extension_for_transformers/neural_chat/assets/speaker_embeddings/" - "spk_embed_default.pt", - ] - ) - p.wait() - default_speaker_embedding = torch.load("spk_embed_default.pt") - except Exception as e: - print("Warning! Need to prepare speaker_embeddings, will use the backup embedding.") - default_speaker_embedding = torch.zeros((1, 512)) - - all_speech = np.array([]) - text = split_long_text_into_batch(text, batch_length=100) - inputs = processor(text=text, padding=True, max_length=128, return_tensors="pt") - with torch.no_grad(): - waveforms, waveform_lengths = model.generate_speech( - inputs["input_ids"].to(device), - speaker_embeddings=default_speaker_embedding.to(device), - attention_mask=inputs["attention_mask"].to(device), - vocoder=vocoder, - return_output_lengths=True, - ) - for i in range(waveforms.size(0)): - all_speech = np.concatenate([all_speech, waveforms[i][: waveform_lengths[i]].cpu().numpy()]) - - print(f"generated speech in {time.time() - start} seconds") - return all_speech +from comps import ( + Base64ByteStrDoc, + ServiceType, + TextDoc, + opea_microservices, + register_microservice, + register_statistics, + statistics_dict, +) @register_microservice( @@ -98,19 +23,22 @@ def text2speech( service_type=ServiceType.TTS, endpoint="/v1/audio/speech", host="0.0.0.0", - port=9999, + port=9088, input_datatype=TextDoc, - output_datatype=TextDoc, + output_datatype=Base64ByteStrDoc, ) -@opea_telemetry +@register_statistics(names=["opea_service@tts"]) async def text_to_audio(input: TextDoc): + start = time.time() text = input.text - speech = text2speech(text=text) - buffered = BytesIO() - buffered.write(speech.tobytes()) - return Base64ByteStrDoc(byte_str=base64.b64encode(buffered.getvalue())) + inputs = {"text": text} + + response = requests.post(url=f"{tts_endpoint}/v1/tts", data=json.dumps(inputs), proxies={"http": None}) + statistics_dict["opea_service@tts"].append_latency(time.time() - start, None) + return Base64ByteStrDoc(byte_str=response.json()["tts_result"]) if __name__ == "__main__": + tts_endpoint = os.getenv("TTS_ENDPOINT", "http://localhost:7055") print("[tts - router] TTS initialized.") opea_microservices["opea_service@tts"].start() diff --git a/tests/test_asr_whisper.sh b/tests/test_asr_whisper.sh new file mode 100644 index 000000000..5e6e4a8c8 --- /dev/null +++ b/tests/test_asr_whisper.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -xe + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + 
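    # Note: unlike the build commands in the component READMEs, these builds do not
    # pass --build-arg http_proxy/https_proxy; behind a proxy those flags may be needed.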
docker build -t opea/whisper:latest -f comps/asr/whisper/Dockerfile . + docker build -t opea/asr:latest -f comps/asr/Dockerfile . +} + +function start_service() { + unset http_proxy + docker run -d --name="test-comps-asr-whisper" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 7066:7066 --ipc=host opea/whisper:latest + docker run -d --name="test-comps-asr" -e ASR_ENDPOINT=http://$ip_address:7066 -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 9099:9099 --ipc=host opea/asr:latest + sleep 3m +} + +function validate_microservice() { + result=$(http_proxy="" curl http://localhost:9099/v1/audio/transcriptions -XPOST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' -H 'Content-Type: application/json') + if [[ $result == *"you"* ]]; then + echo "Result correct." + else + echo "Result wrong." + exit 1 + fi + +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps-asr*") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/test_tts_speecht5.sh b/tests/test_tts_speecht5.sh new file mode 100644 index 000000000..66af28748 --- /dev/null +++ b/tests/test_tts_speecht5.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -xe + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker build -t opea/speecht5:latest -f comps/tts/speecht5/Dockerfile . + docker build -t opea/tts:latest -f comps/tts/Dockerfile . +} + +function start_service() { + unset http_proxy + docker run -d --name="test-comps-tts-speecht5" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 7055:7055 --ipc=host opea/speecht5:latest + docker run -d --name="test-comps-tts" -e TTS_ENDPOINT=http://$ip_address:7055 -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 9088:9088 --ipc=host opea/tts:latest + sleep 3m +} + +function validate_microservice() { + result=$(http_proxy="" curl http://localhost:9088/v1/audio/speech -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json') + if [[ $result == *"AAA"* ]]; then + echo "Result correct." + else + echo "Result wrong." + exit 1 + fi + +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps-tts*") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main
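The TTS test above only greps the base64 payload for `"AAA"`; to actually listen to the output, a client has to rebuild a WAV file, because `speecht5_server.py` base64-encodes the raw samples from `speech.tobytes()` with no WAV header. A hedged client sketch follows: the `byte_str` field name, the float64 sample dtype (the accumulator in `t2s()` starts as `np.array([])`), and the 16 kHz rate are assumptions read off the code in this patch, not documented guarantees.

```python
# Hypothetical client sketch: turn the TTS microservice response back into a .wav file.
# Assumptions (inferred from this patch, not guaranteed): the response JSON exposes the
# base64 payload as "byte_str" (Base64ByteStrDoc), samples decode as float64, and the
# SpeechT5/HiFi-GAN output rate is 16 kHz.
import base64
import json
import wave

import numpy as np
import requests

response = requests.post(
    "http://localhost:9088/v1/audio/speech",
    data=json.dumps({"text": "Who are you?"}),
    headers={"Content-Type": "application/json"},
    proxies={"http": None},
)
raw = base64.b64decode(response.json()["byte_str"])
samples = np.frombuffer(raw, dtype=np.float64)            # raw PCM samples, no WAV header
pcm16 = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

with wave.open("speech.wav", "wb") as wav:
    wav.setnchannels(1)      # SpeechT5 vocoder output is mono
    wav.setsampwidth(2)      # 16-bit PCM
    wav.setframerate(16000)  # assumed SpeechT5/HiFi-GAN sample rate
    wav.writeframes(pcm16.tobytes())
print("wrote speech.wav")
```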