From 344219299fdce88d6e435a23fa0a621b91ea8ac8 Mon Sep 17 00:00:00 2001 From: Spycsh Date: Thu, 20 Jun 2024 00:33:41 -0700 Subject: [PATCH 1/8] add asr/tts component for xeon and hpu Signed-off-by: Spycsh --- comps/asr/Dockerfile | 4 - comps/asr/README.md | 79 +++++++++++++-- comps/asr/asr.py | 96 +++--------------- comps/asr/check_asr_server.py | 28 ++++++ comps/asr/requirements.txt | 3 +- comps/asr/whisper/Dockerfile | 23 +++++ comps/asr/whisper/Dockerfile_hpu | 27 +++++ comps/asr/whisper/__init__.py | 2 + comps/asr/whisper/check_whisper_server.py | 31 ++++++ comps/asr/whisper/whisper_model.py | 115 ++++++++++++++++++++++ comps/asr/whisper/whisper_server.py | 68 +++++++++++++ comps/lvms/README.md | 16 +-- comps/tts/Dockerfile | 10 -- comps/tts/README.md | 76 +++++++++++--- comps/tts/requirements.txt | 2 + comps/tts/speecht5/Dockerfile | 24 +++++ comps/tts/speecht5/Dockerfile_hpu | 28 ++++++ comps/tts/speecht5/__init__.py | 2 + comps/tts/speecht5/speecht5_model.py | 95 ++++++++++++++++++ comps/tts/speecht5/speecht5_server.py | 60 +++++++++++ comps/tts/tts.py | 111 +++------------------ 21 files changed, 674 insertions(+), 226 deletions(-) create mode 100644 comps/asr/check_asr_server.py create mode 100644 comps/asr/whisper/Dockerfile create mode 100644 comps/asr/whisper/Dockerfile_hpu create mode 100644 comps/asr/whisper/__init__.py create mode 100644 comps/asr/whisper/check_whisper_server.py create mode 100644 comps/asr/whisper/whisper_model.py create mode 100644 comps/asr/whisper/whisper_server.py create mode 100644 comps/tts/speecht5/Dockerfile create mode 100644 comps/tts/speecht5/Dockerfile_hpu create mode 100644 comps/tts/speecht5/__init__.py create mode 100644 comps/tts/speecht5/speecht5_model.py create mode 100644 comps/tts/speecht5/speecht5_server.py diff --git a/comps/asr/Dockerfile b/comps/asr/Dockerfile index 8b11e6389..fd1db5806 100644 --- a/comps/asr/Dockerfile +++ b/comps/asr/Dockerfile @@ -5,10 +5,6 @@ FROM python:3.11-slim ENV LANG C.UTF-8 -# 
Install system dependencies -RUN apt-get update \ - && apt-get install -y ffmpeg - COPY comps /home/comps RUN pip install --no-cache-dir --upgrade pip && \ diff --git a/comps/asr/README.md b/comps/asr/README.md index 295683c79..a280762d0 100644 --- a/comps/asr/README.md +++ b/comps/asr/README.md @@ -12,35 +12,94 @@ To start the ASR microservice with Python, you need to first install python pack pip install -r requirements.txt ``` -## 1.2 Start ASR Service with Python Script +## 1.2 Start Whisper Service/Test + +- Xeon CPU + +```bash +cd whisper/ +nohup python whisper_server.py --device=cpu & +python check_whisper_server.py +``` + +- Gaudi2 HPU + +```bash +pip install optimum[habana] + +cd whisper/ +nohup python whisper_server.py --device=hpu & +python check_whisper_server.py +``` + +## 1.3 Start ASR Service/Test ```bash python asr.py +python check_asr_server.py ``` # 🚀2. Start Microservice with Docker (Option 2) Alternatively, you can also start the ASR microservice with Docker. -## 2.1 Build Docker Image +## 2.1 Build Images + +### 2.1.1 Whisper Server Image + +- Xeon CPU + +```bash +cd ../.. +docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/Dockerfile . +``` + +- Gaudi2 HPU + +```bash +cd ../.. +docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/Dockerfile_hpu . +``` + +### 2.1.2 ASR Service Image ```bash -cd ../../ docker build -t opea/asr:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/Dockerfile . 
``` -## 2.2 Run Docker with CLI +## 2.2 Start Whisper and ASR Service + + +### 2.2.1 Start Whisper Server + +- Xeon + +```bash +docker run -p 7066:7066 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/whisper:latest +``` + +- Gaudi2 HPU ```bash -docker run -p 9099:9099 --network=host --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/asr:latest +docker run -p 7066:7066 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/whisper:latest ``` -# 🚀3. Consume ASR Service +### 2.2.2 Start ASR service + +```bash +ip_address=$(hostname -I | awk '{print $1}') + +docker run -d -p 9099:9099 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e ASR_ENDPOINT=http://$ip_address:7066 opea/asr:latest +``` -You can use the following `curl` command to test whether the service is up. Notice that the first request can be slow because it needs to download the models. 
+### 2.2.3 Test ```bash -curl http://localhost:9099/v1/audio/transcriptions \ - -H "Content-Type: application/json" \ - -d '{"url": "https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample_2.wav"}' +# Use curl/python + +# curl +http_proxy="" curl http://localhost:9099/v1/audio/transcriptions -XPOST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' -H 'Content-Type: application/json' + +# python +python check_asr_server.py ``` diff --git a/comps/asr/asr.py b/comps/asr/asr.py index 2acccdc7d..2c107974c 100644 --- a/comps/asr/asr.py +++ b/comps/asr/asr.py @@ -1,78 +1,12 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import contextlib import os import time - import numpy as np -import torch -from datasets import Audio, Dataset -from pydub import AudioSegment -from transformers import WhisperForConditionalGeneration, WhisperProcessor - -from comps import Audio2TextDoc, ServiceType, TextDoc, opea_microservices, opea_telemetry, register_microservice - - -@opea_telemetry -def _audiosegment_to_librosawav(audiosegment): - channel_sounds = audiosegment.split_to_mono()[:1] # only select the first channel - samples = [s.get_array_of_samples() for s in channel_sounds] +import requests, json - fp_arr = np.array(samples).T.astype(np.float32) - fp_arr /= np.iinfo(samples[0].typecode).max - fp_arr = fp_arr.reshape(-1) - - return fp_arr - - -@opea_telemetry -def audio2text( - audio_path, - model_name_or_path="openai/whisper-small", - language=None, - bf16=False, - device="cpu", -): - """Convert audio to text.""" - start = time.time() - model = WhisperForConditionalGeneration.from_pretrained(model_name_or_path).to(device) - processor = WhisperProcessor.from_pretrained(model_name_or_path) - model.eval() - bf16 = bf16 - if bf16: - import intel_extension_for_pytorch as ipex - - model = ipex.optimize(model, dtype=torch.bfloat16) - language = 
language - - try: - waveform = AudioSegment.from_file(audio_path).set_frame_rate(16000) - waveform = _audiosegment_to_librosawav(waveform) - except Exception as e: - print(f"[ASR] audiosegment to librosa wave fail: {e}") - audio_dataset = Dataset.from_dict({"audio": [audio_path]}).cast_column("audio", Audio(sampling_rate=16000)) - waveform = audio_dataset[0]["audio"]["array"] - - inputs = processor.feature_extractor(waveform, return_tensors="pt", sampling_rate=16_000).input_features.to(device) - with torch.cpu.amp.autocast() if bf16 else contextlib.nullcontext(): - if language is None: - predicted_ids = model.generate(inputs) - elif language == "auto": - model.config.forced_decoder_ids = None - predicted_ids = model.generate(inputs) - else: - forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe") - model.config.forced_decoder_ids = forced_decoder_ids - predicted_ids = model.generate(inputs) - - result = processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True, normalize=True)[0] - if language == "auto" or language == "zh": - from zhconv import convert - - result = convert(result, "zh-cn") - print(f"generated text in {time.time() - start} seconds, and the result is: {result}") - return result +from comps import Base64ByteStrDoc, ServiceType, TextDoc, opea_microservices, register_microservice, register_statistics, statistics_dict @register_microservice( @@ -81,26 +15,22 @@ def audio2text( endpoint="/v1/audio/transcriptions", host="0.0.0.0", port=9099, - input_datatype=Audio2TextDoc, + input_datatype=Base64ByteStrDoc, output_datatype=TextDoc, ) -@opea_telemetry -async def audio_to_text(audio: Audio2TextDoc): - audio.tensor, audio.frame_rate = audio.url.load() # AudioNdArray, fr - audio_path = f"{audio.id}.wav" - audio.tensor.save(audio_path, frame_rate=16000) +@register_statistics(names=["opea_service@asr"]) +async def audio_to_text(audio: Base64ByteStrDoc): + start = time.time() + byte_str = audio.byte_str + inputs 
= {"audio": byte_str} + + response = requests.post(url=f"{asr_endpoint}/v1/asr", data=json.dumps(inputs), proxies={"http": None}) - try: - asr_result = audio2text(audio_path, model_name_or_path=audio.model_name_or_path, language=audio.language) - except Exception as e: - print(e) - asr_result = e - finally: - os.remove(audio_path) - res = TextDoc(text=asr_result) - return res + statistics_dict["opea_service@asr"].append_latency(time.time() - start, None) + return TextDoc(text=response.json()["asr_result"]) if __name__ == "__main__": + asr_endpoint = os.getenv("ASR_ENDPOINT", "http://localhost:7066") print("[asr - router] ASR initialized.") opea_microservices["opea_service@asr"].start() diff --git a/comps/asr/check_asr_server.py b/comps/asr/check_asr_server.py new file mode 100644 index 000000000..3f6ed09da --- /dev/null +++ b/comps/asr/check_asr_server.py @@ -0,0 +1,28 @@ +import base64 +import json +from io import BytesIO + +import requests +import uuid +import urllib.request +import os + +# https://gist.github.com/novwhisky/8a1a0168b94f3b6abfaa +# test_audio_base64_str = "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA" + +uid = str(uuid.uuid4()) +file_name = uid + ".wav" + +urllib.request.urlretrieve( + "https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav", + file_name, +) + +with open(file_name, "rb") as f: + test_audio_base64_str = base64.b64encode(f.read()).decode('utf-8') +os.remove(file_name) + +endpoint = "http://localhost:9099/v1/audio/transcriptions" +inputs = {"byte_str": test_audio_base64_str} +response = requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None}) +print(response.json()) \ No newline at end of file diff --git a/comps/asr/requirements.txt b/comps/asr/requirements.txt index aee4f7603..c8f74ba68 100644 --- a/comps/asr/requirements.txt +++ b/comps/asr/requirements.txt @@ -1,7 +1,6 @@ datasets docarray[full] fastapi 
-intel_extension_for_pytorch opentelemetry-api opentelemetry-exporter-otlp opentelemetry-sdk @@ -10,3 +9,5 @@ shortuuid torch transformers zhconv +optimum[habana] +pydantic==2.7.2 \ No newline at end of file diff --git a/comps/asr/whisper/Dockerfile b/comps/asr/whisper/Dockerfile new file mode 100644 index 000000000..c3e2a0025 --- /dev/null +++ b/comps/asr/whisper/Dockerfile @@ -0,0 +1,23 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +# Set environment variables +ENV LANG=en_US.UTF-8 +ENV PYTHONPATH=/home/user + +# Install system dependencies +RUN apt-get update \ + && apt-get install -y ffmpeg + +COPY comps /home/comps + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/comps/asr/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home + +WORKDIR /home/comps/asr/whisper + +ENTRYPOINT ["python", "whisper_server.py", "--device", "cpu"] \ No newline at end of file diff --git a/comps/asr/whisper/Dockerfile_hpu b/comps/asr/whisper/Dockerfile_hpu new file mode 100644 index 000000000..a0339c19c --- /dev/null +++ b/comps/asr/whisper/Dockerfile_hpu @@ -0,0 +1,27 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# HABANA environment +FROM vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1 AS hpu +RUN rm -rf /etc/ssh/ssh_host* + +# Set environment variables +ENV LANG=en_US.UTF-8 +ENV PYTHONPATH=/home/user:/usr/lib/habanalabs/:/optimum-habana + +# Install system dependencies +RUN apt-get update \ + && apt-get install -y ffmpeg + +COPY comps /home/comps + +# Install requirements and optimum habana +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/comps/asr/requirements.txt && \ + pip install optimum[habana] + +ENV PYTHONPATH=$PYTHONPATH:/home + +WORKDIR /home/comps/asr/whisper + +ENTRYPOINT ["python", "llava_server.py", "--device", "hpu"] \ No newline at end of file diff --git 
a/comps/asr/whisper/__init__.py b/comps/asr/whisper/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/asr/whisper/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/asr/whisper/check_whisper_server.py b/comps/asr/whisper/check_whisper_server.py new file mode 100644 index 000000000..415cdd47f --- /dev/null +++ b/comps/asr/whisper/check_whisper_server.py @@ -0,0 +1,31 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import base64 +import json +from io import BytesIO + +import requests +import uuid +import urllib.request +import os + +# https://gist.github.com/novwhisky/8a1a0168b94f3b6abfaa +# test_audio_base64_str = "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA" + +uid = str(uuid.uuid4()) +file_name = uid + ".wav" + +urllib.request.urlretrieve( + "https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav", + file_name, +) + +with open(file_name, "rb") as f: + test_audio_base64_str = base64.b64encode(f.read()).decode('utf-8') +os.remove(file_name) + +endpoint = "http://localhost:7066/v1/asr" +inputs = {"audio": test_audio_base64_str} +response = requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None}) +print(response.json()) \ No newline at end of file diff --git a/comps/asr/whisper/whisper_model.py b/comps/asr/whisper/whisper_model.py new file mode 100644 index 000000000..f553c979b --- /dev/null +++ b/comps/asr/whisper/whisper_model.py @@ -0,0 +1,115 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import contextlib +import os +import time +import urllib.request + +import numpy as np +import torch +from datasets import Audio, Dataset +from pydub import AudioSegment +from transformers import WhisperForConditionalGeneration, WhisperProcessor + + +class WhisperModel: + 
"""Convert audio to text.""" + + def __init__(self, model_name_or_path="openai/whisper-small", language="english", device="cpu"): + if device == "hpu": + # Explicitly link HPU with Torch + from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + + adapt_transformers_to_gaudi() + + self.device = device + asr_model_name_or_path = os.environ.get("ASR_MODEL_PATH", model_name_or_path) + print("Downloading model: {}".format(asr_model_name_or_path)) + self.model = WhisperForConditionalGeneration.from_pretrained(asr_model_name_or_path).to(self.device) + self.processor = WhisperProcessor.from_pretrained(asr_model_name_or_path) + self.model.eval() + + self.language = language + + if device == "hpu": + # do hpu graph warmup with a long enough input audio + # whisper has a receptive field of 30 seconds + # here we select a relatively long audio (~15 sec) to quickly warmup + self._warmup_whisper_hpu_graph("https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/labixiaoxin.wav") + + def _audiosegment_to_librosawav(self, audiosegment): + # https://github.com/jiaaro/pydub/blob/master/API.markdown#audiosegmentget_array_of_samples + # This way is faster than librosa.load or HuggingFace Dataset wrapper + channel_sounds = audiosegment.split_to_mono()[:1] # only select the first channel + samples = [s.get_array_of_samples() for s in channel_sounds] + + fp_arr = np.array(samples).T.astype(np.float32) + fp_arr /= np.iinfo(samples[0].typecode).max + fp_arr = fp_arr.reshape(-1) + + return fp_arr + + def _warmup_whisper_hpu_graph(self, url): + print("[ASR] fetch warmup audio...") + urllib.request.urlretrieve( + url, + "warmup.wav", + ) + print("[ASR] warmup...") + waveform = AudioSegment.from_file("warmup.wav").set_frame_rate(16000) + waveform = self._audiosegment_to_librosawav(waveform) + # pylint: disable=E1101 + inputs = self.processor.feature_extractor( + waveform, return_tensors="pt", sampling_rate=16_000 + ).input_features.to(self.device) + _ = 
self.model.generate(inputs, language="chinese") + + def audio2text(self, audio_path): + """Convert audio to text. + + audio_path: the path to the input audio, e.g. ~/xxx.mp3 + """ + start = time.time() + + try: + waveform = AudioSegment.from_file(audio_path).set_frame_rate(16000) + waveform = self._audiosegment_to_librosawav(waveform) + except Exception as e: + print(f"[ASR] audiosegment to librosa wave fail: {e}") + audio_dataset = Dataset.from_dict({"audio": [audio_path]}).cast_column("audio", Audio(sampling_rate=16000)) + waveform = audio_dataset[0]["audio"]["array"] + + # pylint: disable=E1101 + inputs = self.processor.feature_extractor( + waveform, return_tensors="pt", sampling_rate=16_000 + ).input_features.to(self.device) + predicted_ids = self.model.generate(inputs, language=self.language) + # pylint: disable=E1101 + result = self.processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True, normalize=True)[0] + if self.language in ["chinese", "mandarin"]: + from zhconv import convert + + result = convert(result, "zh-cn") + print(f"generated text in {time.time() - start} seconds, and the result is: {result}") + return result + + +if __name__ == "__main__": + asr = WhisperModel(language="english") + + # Test multilanguage asr + urllib.request.urlretrieve( + "https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/labixiaoxin.wav", + "sample.wav", + ) + asr.language = "chinese" + text = asr.audio2text("sample.wav") + + urllib.request.urlretrieve( + "https://github.com/intel/intel-extension-for-transformers/raw/main/intel_extension_for_transformers/neural_chat/assets/audio/sample.wav", + "sample.wav", + ) + text = asr.audio2text("sample.wav") + + os.remove("sample.wav") \ No newline at end of file diff --git a/comps/asr/whisper/whisper_server.py b/comps/asr/whisper/whisper_server.py new file mode 100644 index 000000000..b34d57a18 --- /dev/null +++ b/comps/asr/whisper/whisper_server.py @@ -0,0 +1,68 @@ +# Copyright (C) 2024 Intel Corporation +# 
SPDX-License-Identifier: Apache-2.0 + +import argparse +import os + +import uvicorn +from whisper_model import WhisperModel +from fastapi import FastAPI, Request + +from fastapi.responses import Response +from pydub import AudioSegment +from starlette.middleware.cors import CORSMiddleware + +from io import BytesIO +import base64 +import uuid + +app = FastAPI() +asr = None + +app.add_middleware( + CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"] +) + + +@app.get("/v1/health") +async def health() -> Response: + """Health check.""" + return Response(status_code=200) + + +@app.post("/v1/asr") +async def audio_to_text(request: Request): + print("Whisper generation begin.") + uid = str(uuid.uuid4()) + request_dict = await request.json() + audio_b64_str = request_dict.pop("audio") + audio = AudioSegment.from_file(BytesIO(base64.b64decode(audio_b64_str))) + + audio = audio.set_frame_rate(16000) + # bytes to wav + file_name = uid + ".wav" + audio.export(f"{file_name}", format="wav") + try: + asr_result = asr.audio2text(file_name) + except Exception as e: + print(e) + asr_result = e + finally: + os.remove(file_name) + return {"asr_result": asr_result} + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="0.0.0.0") + parser.add_argument("--port", type=int, default=7066) + parser.add_argument("--model_name_or_path", type=str, default="openai/whisper-small") + parser.add_argument("--language", type=str, default="english") + parser.add_argument("--device", type=str, default="cpu") + + args = parser.parse_args() + asr = WhisperModel( + model_name_or_path=args.model_name_or_path, language=args.language, device=args.device + ) + + uvicorn.run(app, host=args.host, port=args.port) \ No newline at end of file diff --git a/comps/lvms/README.md b/comps/lvms/README.md index 67854885f..8f8237180 100644 --- a/comps/lvms/README.md +++ b/comps/lvms/README.md @@ -47,11 +47,11 
@@ python lvm.py python check_lvm.py ``` -# 🚀1. Start Microservice with Docker (Option 2) +# 🚀2. Start Microservice with Docker (Option 2) -## 1.2 Build Images +## 2.1 Build Images -### 1.2.1 LLaVA Server Image +### 2.1.1 LLaVA Server Image - Xeon CPU @@ -67,16 +67,16 @@ cd ../.. docker build -t opea/llava:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/llava/Dockerfile_hpu . ``` -### 1.2.2 LVM Service Image +### 2.1.2 LVM Service Image ```bash cd ../.. docker build -t opea/lvm:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/Dockerfile . ``` -## 1.3 Start LLaVA and LVM Service +## 2.2 Start LLaVA and LVM Service -### 1.3.1 Start LLaVA server +### 2.2.1 Start LLaVA server - Xeon @@ -90,7 +90,7 @@ docker run -p 8399:8399 -e http_proxy=$http_proxy --ipc=host -e https_proxy=$htt docker run -p 8399:8399 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/llava:latest ``` -### 1.3.2 Start LVM service +### 2.2.2 Start LVM service ```bash ip_address=$(hostname -I | awk '{print $1}') @@ -98,7 +98,7 @@ ip_address=$(hostname -I | awk '{print $1}') docker run -p 9399:9399 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LVM_ENDPOINT=http://$ip_address:8399 opea/lvm:latest ``` -### 1.3.3 Test +### 2.2.3 Test ```bash # Use curl/python diff --git a/comps/tts/Dockerfile b/comps/tts/Dockerfile index 3ad17b8f8..50dd9e15f 100644 --- a/comps/tts/Dockerfile +++ b/comps/tts/Dockerfile @@ -5,21 +5,11 @@ FROM python:3.11-slim ENV LANG C.UTF-8 -# Install system dependencies -RUN apt-get update \ - && apt-get install -y ffmpeg \ - && apt-get install -y curl \ - && apt-get install -y libomp-dev google-perftools - COPY comps /home/comps RUN pip install --no-cache-dir --upgrade pip && \ pip install --no-cache-dir -r /home/comps/tts/requirements.txt -ENV 
LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libiomp5.so:/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4 -ENV MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000" -ENV OMP_NUM_THREADS=56 - ENV PYTHONPATH=$PYTHONPATH:/home WORKDIR /home/comps/tts diff --git a/comps/tts/README.md b/comps/tts/README.md index b0aa32456..66519264b 100644 --- a/comps/tts/README.md +++ b/comps/tts/README.md @@ -2,45 +2,91 @@ TTS (Text-To-Speech) microservice helps users convert text to speech. When building a talking bot with LLM, users might need an LLM generated answer in audio format. This microservice is built for that conversion stage. -# 🚀1. Start Microservice with Python (Option 1) +## 1.2 Start SpeechT5 Service/Test -To start the TTS microservice, you need to first install python packages. +- Xeon CPU -## 1.1 Install Requirements +```bash +cd speecht5/ +nohup python speecht5_server.py --device=cpu & +curl http://localhost:7055/v1/tts -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json' +``` + +- Gaudi2 HPU ```bash -pip install -r requirements.txt +pip install optimum[habana] + +cd speecht5/ +nohup python speecht5_server.py --device=hpu & +curl http://localhost:7055/v1/tts -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json' ``` -## 1.2 Start TTS Service with Python Script +## 1.3 Start TTS Service/Test ```bash python tts.py + +curl http://localhost:9088/v1/audio/speech -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json' ``` + # 🚀2. Start Microservice with Docker (Option 2) -Alternatively, you can start the ASR microservice with Docker. +Alternatively, you can start the TTS microservice with Docker. + +## 2.1 Build Images -## 2.1 Build Docker Image + +### 2.1.1 SpeechT5 Server Image + +- Xeon CPU + +```bash +cd ../.. +docker build -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/speecht5/Dockerfile . 
+``` + +- Gaudi2 HPU + +```bash +cd ../.. +docker build -t opea/speecht5:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/speecht5/Dockerfile_hpu . +``` + +### 2.1.2 TTS Service Image ```bash -cd ../../ docker build -t opea/tts:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/tts/Dockerfile . ``` -## 2.2 Run Docker with CLI +## 2.2 Start SpeechT5 and TTS Service + + +### 2.2.1 Start SpeechT5 Server + +- Xeon ```bash -docker run -p 9999:9999 --network=host --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/tts:latest +docker run -p 7055:7055 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/speecht5:latest ``` -# 🚀3. Consume TTS Service +- Gaudi2 HPU + +```bash +docker run -p 7055:7055 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/speecht5:latest +``` + +### 2.2.2 Start TTS service + +```bash +ip_address=$(hostname -I | awk '{print $1}') + +docker run -p 9088:9088 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TTS_ENDPOINT=http://$ip_address:7055 opea/tts:latest +``` -You can use the following `curl` command to test whether the service is up. Notice that the first request can be slow because it needs to download the models. 
+### 2.2.3 Test ```bash -curl http://localhost:9999/v1/audio/speech \ - -H "Content-Type: application/json" \ - -d '{"text":"Hello there."}' +# curl +http_proxy="" curl http://localhost:9088/v1/audio/speech -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json' ``` diff --git a/comps/tts/requirements.txt b/comps/tts/requirements.txt index 68f2c0128..8becc2c8e 100644 --- a/comps/tts/requirements.txt +++ b/comps/tts/requirements.txt @@ -7,3 +7,5 @@ sentencepiece shortuuid torch transformers +optimum[habana] +aiohttp \ No newline at end of file diff --git a/comps/tts/speecht5/Dockerfile b/comps/tts/speecht5/Dockerfile new file mode 100644 index 000000000..e4afd07db --- /dev/null +++ b/comps/tts/speecht5/Dockerfile @@ -0,0 +1,24 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +# Set environment variables +ENV LANG=en_US.UTF-8 +ENV PYTHONPATH=/home/user + +# Install system dependencies +RUN apt-get update \ + && apt-get install -y ffmpeg \ + && apt-get install -y curl + +COPY comps /home/comps + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/comps/tts/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home + +WORKDIR /home/comps/tts/speecht5 + +ENTRYPOINT ["python", "speecht5_server.py", "--device", "cpu"] \ No newline at end of file diff --git a/comps/tts/speecht5/Dockerfile_hpu b/comps/tts/speecht5/Dockerfile_hpu new file mode 100644 index 000000000..3b27926c3 --- /dev/null +++ b/comps/tts/speecht5/Dockerfile_hpu @@ -0,0 +1,28 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# HABANA environment +FROM vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1 AS hpu +RUN rm -rf /etc/ssh/ssh_host* + +# Set environment variables +ENV LANG=en_US.UTF-8 +ENV PYTHONPATH=/home/user:/usr/lib/habanalabs/:/optimum-habana + +# Install system dependencies +RUN apt-get update \ + && apt-get install -y 
ffmpeg \ + && apt-get install -y curl + +COPY comps /home/comps + +# Install requirements and optimum habana +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/comps/tts/requirements.txt && \ + pip install optimum[habana] + +ENV PYTHONPATH=$PYTHONPATH:/home + +WORKDIR /home/comps/tts/speecht5 + +ENTRYPOINT ["python", "speecht5_server.py", "--device", "hpu"] \ No newline at end of file diff --git a/comps/tts/speecht5/__init__.py b/comps/tts/speecht5/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/tts/speecht5/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/tts/speecht5/speecht5_model.py b/comps/tts/speecht5/speecht5_model.py new file mode 100644 index 000000000..95127f947 --- /dev/null +++ b/comps/tts/speecht5/speecht5_model.py @@ -0,0 +1,95 @@ +from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor +import torch +import os + +import numpy as np +import torch +import subprocess + +class SpeechT5Model(): + def __init__(self, device="cpu"): + self.device = device + if self.device == "hpu": + from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + adapt_transformers_to_gaudi() + # do hpu graph warmup with variable inputs + self._warmup_speecht5_hpu_graph() + + + model_name_or_path = "microsoft/speecht5_tts" + vocoder_model_name_or_path = "microsoft/speecht5_hifigan" + self.model = SpeechT5ForTextToSpeech.from_pretrained(model_name_or_path).to(device) + self.model.eval() + self.processor = SpeechT5Processor.from_pretrained(model_name_or_path, normalize=True) + self.vocoder = SpeechT5HifiGan.from_pretrained(vocoder_model_name_or_path).to(device) + self.vocoder.eval() + + # fetch default speaker embedding + if os.path.exists("spk_embed_default.pt"): + self.default_speaker_embedding = torch.load("spk_embed_default.pt") + else: + try: + p = subprocess.Popen( + [ + 
"curl", + "-O", + "https://raw.githubusercontent.com/intel/intel-extension-for-transformers/main/" + "intel_extension_for_transformers/neural_chat/assets/speaker_embeddings/" + "spk_embed_default.pt", + ] + ) + p.wait() + self.default_speaker_embedding = torch.load("spk_embed_default.pt") + except Exception as e: + print("Warning! Need to prepare speaker_embeddings, will use the backup embedding.") + self.default_speaker_embedding = torch.zeros((1, 512)) + + def split_long_text_into_batch(self, text, batch_length=128): + """Batch the long text into sequences of shorter sentences.""" + res = [] + hitted_ends = [",", ".", "?", "!", "。", ";", " "] + idx = 0 + cur_start = 0 + cur_end = -1 + while idx < len(text): + if idx - cur_start > batch_length: + if cur_end != -1 and cur_end > cur_start: + res.append(text[cur_start : cur_end + 1]) + else: + cur_end = cur_start + batch_length - 1 + res.append(text[cur_start : cur_end + 1]) + idx = cur_end + cur_start = cur_end + 1 + if text[idx] in hitted_ends: + cur_end = idx + idx += 1 + # deal with the last sequence + res.append(text[cur_start:idx]) + res = [i + "." 
for i in res] # avoid unexpected end of sequence + return res + + def _warmup_speecht5_hpu_graph(self): + self.t2s("Hello, how can I help you today?") + self.t2s("OPEA is an ecosystem orchestration framework to integrate performant GenAI technologies.") + self.t2s("OPEA is an ecosystem orchestration framework to integrate performant GenAI technologies & workflows leading to quicker GenAI adoption and business value.") + + def t2s(self, text): + if self.device == "hpu": + # See https://github.com/huggingface/optimum-habana/pull/824 + from optimum.habana.utils import set_seed + set_seed(555) + all_speech = np.array([]) + text = self.split_long_text_into_batch(text, batch_length=100) + inputs = self.processor(text=text, padding=True, max_length=128, return_tensors="pt") + with torch.no_grad(): + waveforms, waveform_lengths = self.model.generate_speech( + inputs["input_ids"].to(self.device), + speaker_embeddings=self.default_speaker_embedding.to(self.device), + attention_mask=inputs["attention_mask"].to(self.device), + vocoder=self.vocoder, + return_output_lengths=True, + ) + for i in range(waveforms.size(0)): + all_speech = np.concatenate([all_speech, waveforms[i][: waveform_lengths[i]].cpu().numpy()]) + + return all_speech \ No newline at end of file diff --git a/comps/tts/speecht5/speecht5_server.py b/comps/tts/speecht5/speecht5_server.py new file mode 100644 index 000000000..851b30b63 --- /dev/null +++ b/comps/tts/speecht5/speecht5_server.py @@ -0,0 +1,60 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse + +import uvicorn +from speecht5_model import SpeechT5Model +from fastapi import FastAPI, Request + +from fastapi.responses import Response +from starlette.middleware.cors import CORSMiddleware + +from io import BytesIO +import base64 + +"""Test: + +curl http://localhost:7055/v1/tts -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json' +""" + +app = FastAPI() +tts = None + +app.add_middleware( + 
CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"] +) + + +@app.get("/v1/health") +async def health() -> Response: + """Health check.""" + return Response(status_code=200) + + +@app.post("/v1/tts") +async def text_to_speech(request: Request): + print("SpeechT5 generation begin.") + request_dict = await request.json() + text = request_dict.pop("text") + + speech = tts.t2s(text) + + buffered = BytesIO() + buffered.write(speech.tobytes()) + return {"tts_result": base64.b64encode(buffered.getvalue())} + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="0.0.0.0") + parser.add_argument("--port", type=int, default=7055) + parser.add_argument("--device", type=str, default="cpu") + + args = parser.parse_args() + tts = SpeechT5Model( + device=args.device + ) + + uvicorn.run(app, host=args.host, port=args.port) \ No newline at end of file diff --git a/comps/tts/tts.py b/comps/tts/tts.py index 11891cd04..2e15bbdba 100644 --- a/comps/tts/tts.py +++ b/comps/tts/tts.py @@ -1,116 +1,37 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import base64 -import os -import time -from io import BytesIO +import os, json, time +import requests -import numpy as np -import torch -from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor +from comps import Base64ByteStrDoc, ServiceType, TextDoc, opea_microservices, register_microservice, register_statistics, statistics_dict -from comps import Base64ByteStrDoc, ServiceType, TextDoc, opea_microservices, opea_telemetry, register_microservice - - -@opea_telemetry -def split_long_text_into_batch(text, batch_length=128): - """Batch the long text into sequences of shorter sentences.""" - res = [] - hitted_ends = [",", ".", "?", "!", "。", ";", " "] - idx = 0 - cur_start = 0 - cur_end = -1 - while idx < len(text): - if idx - cur_start > batch_length: - if cur_end != -1 and 
cur_end > cur_start: - res.append(text[cur_start : cur_end + 1]) - else: - cur_end = cur_start + batch_length - 1 - res.append(text[cur_start : cur_end + 1]) - idx = cur_end - cur_start = cur_end + 1 - if text[idx] in hitted_ends: - cur_end = idx - idx += 1 - # deal with the last sequence - res.append(text[cur_start:idx]) - res = [i + "." for i in res] # avoid unexpected end of sequence - return res - - -@opea_telemetry -def text2speech( - text, - model_name_or_path="microsoft/speecht5_tts", - vocoder_model_name_or_path="microsoft/speecht5_hifigan", - output_audio_path="./response.wav", - device="cpu", -): - start = time.time() - model = SpeechT5ForTextToSpeech.from_pretrained(model_name_or_path).to(device) - model.eval() - processor = SpeechT5Processor.from_pretrained(model_name_or_path, normalize=True) - vocoder = SpeechT5HifiGan.from_pretrained(vocoder_model_name_or_path).to(device) - vocoder.eval() - - if os.path.exists("spk_embed_default.pt"): - default_speaker_embedding = torch.load("spk_embed_default.pt") - else: # pragma: no cover - import subprocess - - try: - p = subprocess.Popen( - [ - "curl", - "-O", - "https://raw.githubusercontent.com/intel/intel-extension-for-transformers/main/" - "intel_extension_for_transformers/neural_chat/assets/speaker_embeddings/" - "spk_embed_default.pt", - ] - ) - p.wait() - default_speaker_embedding = torch.load("spk_embed_default.pt") - except Exception as e: - print("Warning! 
Need to prepare speaker_embeddings, will use the backup embedding.") - default_speaker_embedding = torch.zeros((1, 512)) - - all_speech = np.array([]) - text = split_long_text_into_batch(text, batch_length=100) - inputs = processor(text=text, padding=True, max_length=128, return_tensors="pt") - with torch.no_grad(): - waveforms, waveform_lengths = model.generate_speech( - inputs["input_ids"].to(device), - speaker_embeddings=default_speaker_embedding.to(device), - attention_mask=inputs["attention_mask"].to(device), - vocoder=vocoder, - return_output_lengths=True, - ) - for i in range(waveforms.size(0)): - all_speech = np.concatenate([all_speech, waveforms[i][: waveform_lengths[i]].cpu().numpy()]) - - print(f"generated speech in {time.time() - start} seconds") - return all_speech +"""Test: +curl http://localhost:9088/v1/audio/speech -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json' +""" @register_microservice( name="opea_service@tts", service_type=ServiceType.TTS, endpoint="/v1/audio/speech", host="0.0.0.0", - port=9999, + port=9088, input_datatype=TextDoc, - output_datatype=TextDoc, + output_datatype=Base64ByteStrDoc, ) -@opea_telemetry +@register_statistics(names=["opea_service@tts"]) async def text_to_audio(input: TextDoc): + start = time.time() text = input.text - speech = text2speech(text=text) - buffered = BytesIO() - buffered.write(speech.tobytes()) - return Base64ByteStrDoc(byte_str=base64.b64encode(buffered.getvalue())) + inputs = {"text": text} + + response = requests.post(url=f"{tts_endpoint}/v1/tts", data=json.dumps(inputs), proxies={"http": None}) + statistics_dict["opea_service@tts"].append_latency(time.time() - start, None) + return Base64ByteStrDoc(byte_str=response.json()["tts_result"]) if __name__ == "__main__": + tts_endpoint = os.getenv("TTS_ENDPOINT", "http://localhost:7055") print("[tts - router] TTS initialized.") opea_microservices["opea_service@tts"].start() From 830da039a671f1468c3c26b3d177e81f33997019 Mon Sep 17 
00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 20 Jun 2024 07:36:11 +0000 Subject: [PATCH 2/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/asr/README.md | 1 - comps/asr/asr.py | 16 +++++++++++++--- comps/asr/check_asr_server.py | 13 ++++++++----- comps/asr/requirements.txt | 4 ++-- comps/asr/whisper/check_whisper_server.py | 10 +++++----- comps/asr/whisper/whisper_model.py | 2 +- comps/asr/whisper/whisper_server.py | 16 ++++++---------- comps/tts/README.md | 2 -- comps/tts/requirements.txt | 4 ++-- comps/tts/speecht5/speecht5_model.py | 22 ++++++++++++++-------- comps/tts/speecht5/speecht5_server.py | 15 +++++---------- comps/tts/tts.py | 16 ++++++++++++++-- 12 files changed, 70 insertions(+), 51 deletions(-) diff --git a/comps/asr/README.md b/comps/asr/README.md index a280762d0..681d71a69 100644 --- a/comps/asr/README.md +++ b/comps/asr/README.md @@ -69,7 +69,6 @@ docker build -t opea/asr:latest --build-arg https_proxy=$https_proxy --build-arg ## 2.2 Start Whisper and ASR Service - ### 2.2.1 Start Whisper Server - Xeon diff --git a/comps/asr/asr.py b/comps/asr/asr.py index 2c107974c..97fbb0bb0 100644 --- a/comps/asr/asr.py +++ b/comps/asr/asr.py @@ -1,12 +1,22 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import json import os import time -import numpy as np -import requests, json -from comps import Base64ByteStrDoc, ServiceType, TextDoc, opea_microservices, register_microservice, register_statistics, statistics_dict +import numpy as np +import requests + +from comps import ( + Base64ByteStrDoc, + ServiceType, + TextDoc, + opea_microservices, + register_microservice, + register_statistics, + statistics_dict, +) @register_microservice( diff --git a/comps/asr/check_asr_server.py b/comps/asr/check_asr_server.py index 3f6ed09da..d51bc8569 100644 --- a/comps/asr/check_asr_server.py +++ 
b/comps/asr/check_asr_server.py @@ -1,11 +1,14 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import base64 import json +import os +import urllib.request +import uuid from io import BytesIO import requests -import uuid -import urllib.request -import os # https://gist.github.com/novwhisky/8a1a0168b94f3b6abfaa # test_audio_base64_str = "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA" @@ -19,10 +22,10 @@ ) with open(file_name, "rb") as f: - test_audio_base64_str = base64.b64encode(f.read()).decode('utf-8') + test_audio_base64_str = base64.b64encode(f.read()).decode("utf-8") os.remove(file_name) endpoint = "http://localhost:9099/v1/audio/transcriptions" inputs = {"byte_str": test_audio_base64_str} response = requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None}) -print(response.json()) \ No newline at end of file +print(response.json()) diff --git a/comps/asr/requirements.txt b/comps/asr/requirements.txt index c8f74ba68..3c0c8be74 100644 --- a/comps/asr/requirements.txt +++ b/comps/asr/requirements.txt @@ -4,10 +4,10 @@ fastapi opentelemetry-api opentelemetry-exporter-otlp opentelemetry-sdk +optimum[habana] +pydantic==2.7.2 pydub shortuuid torch transformers zhconv -optimum[habana] -pydantic==2.7.2 \ No newline at end of file diff --git a/comps/asr/whisper/check_whisper_server.py b/comps/asr/whisper/check_whisper_server.py index 415cdd47f..1b338d08d 100644 --- a/comps/asr/whisper/check_whisper_server.py +++ b/comps/asr/whisper/check_whisper_server.py @@ -3,12 +3,12 @@ import base64 import json +import os +import urllib.request +import uuid from io import BytesIO import requests -import uuid -import urllib.request -import os # https://gist.github.com/novwhisky/8a1a0168b94f3b6abfaa # test_audio_base64_str = "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA" @@ -22,10 +22,10 @@ ) with open(file_name, "rb") as f: - test_audio_base64_str = base64.b64encode(f.read()).decode('utf-8') 
+ test_audio_base64_str = base64.b64encode(f.read()).decode("utf-8") os.remove(file_name) endpoint = "http://localhost:7066/v1/asr" inputs = {"audio": test_audio_base64_str} response = requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None}) -print(response.json()) \ No newline at end of file +print(response.json()) diff --git a/comps/asr/whisper/whisper_model.py b/comps/asr/whisper/whisper_model.py index f553c979b..0af9ebfcb 100644 --- a/comps/asr/whisper/whisper_model.py +++ b/comps/asr/whisper/whisper_model.py @@ -112,4 +112,4 @@ def audio2text(self, audio_path): ) text = asr.audio2text("sample.wav") - os.remove("sample.wav") \ No newline at end of file + os.remove("sample.wav") diff --git a/comps/asr/whisper/whisper_server.py b/comps/asr/whisper/whisper_server.py index b34d57a18..8cf078d3c 100644 --- a/comps/asr/whisper/whisper_server.py +++ b/comps/asr/whisper/whisper_server.py @@ -2,19 +2,17 @@ # SPDX-License-Identifier: Apache-2.0 import argparse +import base64 import os +import uuid +from io import BytesIO import uvicorn -from whisper_model import WhisperModel from fastapi import FastAPI, Request - from fastapi.responses import Response from pydub import AudioSegment from starlette.middleware.cors import CORSMiddleware - -from io import BytesIO -import base64 -import uuid +from whisper_model import WhisperModel app = FastAPI() asr = None @@ -61,8 +59,6 @@ async def audio_to_text(request: Request): parser.add_argument("--device", type=str, default="cpu") args = parser.parse_args() - asr = WhisperModel( - model_name_or_path=args.model_name_or_path, language=args.language, device=args.device - ) + asr = WhisperModel(model_name_or_path=args.model_name_or_path, language=args.language, device=args.device) - uvicorn.run(app, host=args.host, port=args.port) \ No newline at end of file + uvicorn.run(app, host=args.host, port=args.port) diff --git a/comps/tts/README.md b/comps/tts/README.md index 66519264b..342933d40 100644 --- 
a/comps/tts/README.md +++ b/comps/tts/README.md @@ -30,7 +30,6 @@ python tts.py curl http://localhost:9088/v1/audio/speech -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json' ``` - # 🚀2. Start Microservice with Docker (Option 2) Alternatively, you can start the TTS microservice with Docker. @@ -61,7 +60,6 @@ docker build -t opea/tts:latest --build-arg https_proxy=$https_proxy --build-arg ## 2.2 Start SpeechT5 and TTS Service - ### 2.2.1 Start SpeechT5 Server - Xeon diff --git a/comps/tts/requirements.txt b/comps/tts/requirements.txt index 8becc2c8e..7baa60a17 100644 --- a/comps/tts/requirements.txt +++ b/comps/tts/requirements.txt @@ -1,11 +1,11 @@ +aiohttp docarray[full] fastapi opentelemetry-api opentelemetry-exporter-otlp opentelemetry-sdk +optimum[habana] sentencepiece shortuuid torch transformers -optimum[habana] -aiohttp \ No newline at end of file diff --git a/comps/tts/speecht5/speecht5_model.py b/comps/tts/speecht5/speecht5_model.py index 95127f947..e4a36b169 100644 --- a/comps/tts/speecht5/speecht5_model.py +++ b/comps/tts/speecht5/speecht5_model.py @@ -1,21 +1,24 @@ -from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor -import torch +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import os +import subprocess import numpy as np import torch -import subprocess +from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor -class SpeechT5Model(): + +class SpeechT5Model: def __init__(self, device="cpu"): self.device = device if self.device == "hpu": from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + adapt_transformers_to_gaudi() # do hpu graph warmup with variable inputs self._warmup_speecht5_hpu_graph() - model_name_or_path = "microsoft/speecht5_tts" vocoder_model_name_or_path = "microsoft/speecht5_hifigan" self.model = SpeechT5ForTextToSpeech.from_pretrained(model_name_or_path).to(device) @@ -23,7 +26,7 @@ def 
__init__(self, device="cpu"): self.processor = SpeechT5Processor.from_pretrained(model_name_or_path, normalize=True) self.vocoder = SpeechT5HifiGan.from_pretrained(vocoder_model_name_or_path).to(device) self.vocoder.eval() - + # fetch default speaker embedding if os.path.exists("spk_embed_default.pt"): self.default_speaker_embedding = torch.load("spk_embed_default.pt") @@ -71,12 +74,15 @@ def split_long_text_into_batch(self, text, batch_length=128): def _warmup_speecht5_hpu_graph(self): self.t2s("Hello, how can I help you today?") self.t2s("OPEA is an ecosystem orchestration framework to integrate performant GenAI technologies.") - self.t2s("OPEA is an ecosystem orchestration framework to integrate performant GenAI technologies & workflows leading to quicker GenAI adoption and business value.") + self.t2s( + "OPEA is an ecosystem orchestration framework to integrate performant GenAI technologies & workflows leading to quicker GenAI adoption and business value." + ) def t2s(self, text): if self.device == "hpu": # See https://github.com/huggingface/optimum-habana/pull/824 from optimum.habana.utils import set_seed + set_seed(555) all_speech = np.array([]) text = self.split_long_text_into_batch(text, batch_length=100) @@ -92,4 +98,4 @@ def t2s(self, text): for i in range(waveforms.size(0)): all_speech = np.concatenate([all_speech, waveforms[i][: waveform_lengths[i]].cpu().numpy()]) - return all_speech \ No newline at end of file + return all_speech diff --git a/comps/tts/speecht5/speecht5_server.py b/comps/tts/speecht5/speecht5_server.py index 851b30b63..8ddf5a2da 100644 --- a/comps/tts/speecht5/speecht5_server.py +++ b/comps/tts/speecht5/speecht5_server.py @@ -2,17 +2,15 @@ # SPDX-License-Identifier: Apache-2.0 import argparse +import base64 +from io import BytesIO import uvicorn -from speecht5_model import SpeechT5Model from fastapi import FastAPI, Request - from fastapi.responses import Response +from speecht5_model import SpeechT5Model from 
starlette.middleware.cors import CORSMiddleware -from io import BytesIO -import base64 - """Test: curl http://localhost:7055/v1/tts -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json' @@ -45,7 +43,6 @@ async def text_to_speech(request: Request): return {"tts_result": base64.b64encode(buffered.getvalue())} - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default="0.0.0.0") @@ -53,8 +50,6 @@ async def text_to_speech(request: Request): parser.add_argument("--device", type=str, default="cpu") args = parser.parse_args() - tts = SpeechT5Model( - device=args.device - ) + tts = SpeechT5Model(device=args.device) - uvicorn.run(app, host=args.host, port=args.port) \ No newline at end of file + uvicorn.run(app, host=args.host, port=args.port) diff --git a/comps/tts/tts.py b/comps/tts/tts.py index 2e15bbdba..f1a017c21 100644 --- a/comps/tts/tts.py +++ b/comps/tts/tts.py @@ -1,16 +1,28 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import os, json, time +import json +import os +import time + import requests -from comps import Base64ByteStrDoc, ServiceType, TextDoc, opea_microservices, register_microservice, register_statistics, statistics_dict +from comps import ( + Base64ByteStrDoc, + ServiceType, + TextDoc, + opea_microservices, + register_microservice, + register_statistics, + statistics_dict, +) """Test: curl http://localhost:9088/v1/audio/speech -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json' """ + @register_microservice( name="opea_service@tts", service_type=ServiceType.TTS, From d0d3507af56ecfbc97c1cc1fecd859dd0c96c0cc Mon Sep 17 00:00:00 2001 From: Spycsh Date: Thu, 20 Jun 2024 01:02:30 -0700 Subject: [PATCH 3/8] fix --- comps/asr/whisper/Dockerfile_hpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/asr/whisper/Dockerfile_hpu b/comps/asr/whisper/Dockerfile_hpu index a0339c19c..37fb9b824 100644 --- 
a/comps/asr/whisper/Dockerfile_hpu +++ b/comps/asr/whisper/Dockerfile_hpu @@ -24,4 +24,4 @@ ENV PYTHONPATH=$PYTHONPATH:/home WORKDIR /home/comps/asr/whisper -ENTRYPOINT ["python", "llava_server.py", "--device", "hpu"] \ No newline at end of file +ENTRYPOINT ["python", "whisper_server.py", "--device", "hpu"] \ No newline at end of file From d728b157f78c713935c2a53d4b1cd755f35a2429 Mon Sep 17 00:00:00 2001 From: Spycsh Date: Thu, 20 Jun 2024 02:09:10 -0700 Subject: [PATCH 4/8] fix --- comps/asr/README.md | 4 ++-- comps/asr/whisper/Dockerfile_hpu | 1 - comps/tts/speecht5/speecht5_model.py | 6 ++++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/comps/asr/README.md b/comps/asr/README.md index 681d71a69..e2a5cc858 100644 --- a/comps/asr/README.md +++ b/comps/asr/README.md @@ -29,14 +29,14 @@ pip install optimum[habana] cd whisper/ nohup python whisper_server.py --device=hpu & -python check_whisper.py +python check_whisper_server.py ``` ## 1.3 Start ASR Service/Test ```bash python asr.py -python check_asr.py +python check_asr_server.py ``` # 🚀2. 
Start Microservice with Docker (Option 2) diff --git a/comps/asr/whisper/Dockerfile_hpu b/comps/asr/whisper/Dockerfile_hpu index 37fb9b824..0af3b1e82 100644 --- a/comps/asr/whisper/Dockerfile_hpu +++ b/comps/asr/whisper/Dockerfile_hpu @@ -3,7 +3,6 @@ # HABANA environment FROM vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1 AS hpu -RUN rm -rf /etc/ssh/ssh_host* # Set environment variables ENV LANG=en_US.UTF-8 diff --git a/comps/tts/speecht5/speecht5_model.py b/comps/tts/speecht5/speecht5_model.py index e4a36b169..3c2bbe68a 100644 --- a/comps/tts/speecht5/speecht5_model.py +++ b/comps/tts/speecht5/speecht5_model.py @@ -16,8 +16,6 @@ def __init__(self, device="cpu"): from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi adapt_transformers_to_gaudi() - # do hpu graph warmup with variable inputs - self._warmup_speecht5_hpu_graph() model_name_or_path = "microsoft/speecht5_tts" vocoder_model_name_or_path = "microsoft/speecht5_hifigan" @@ -47,6 +45,10 @@ def __init__(self, device="cpu"): print("Warning! 
Need to prepare speaker_embeddings, will use the backup embedding.") self.default_speaker_embedding = torch.zeros((1, 512)) + if self.device == "hpu": + # do hpu graph warmup with variable inputs + self._warmup_speecht5_hpu_graph() + def split_long_text_into_batch(self, text, batch_length=128): """Batch the long text into sequences of shorter sentences.""" res = [] From 97d9a6022ff402d6130c994a3376799b4d24ec1c Mon Sep 17 00:00:00 2001 From: Spycsh Date: Thu, 20 Jun 2024 06:17:48 -0700 Subject: [PATCH 5/8] fix ffmpeg JSONDecode error on HPU --- comps/asr/whisper/whisper_server.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comps/asr/whisper/whisper_server.py b/comps/asr/whisper/whisper_server.py index 8cf078d3c..1a5c760d2 100644 --- a/comps/asr/whisper/whisper_server.py +++ b/comps/asr/whisper/whisper_server.py @@ -5,7 +5,6 @@ import base64 import os import uuid -from io import BytesIO import uvicorn from fastapi import FastAPI, Request @@ -32,13 +31,14 @@ async def health() -> Response: async def audio_to_text(request: Request): print("Whisper generation begin.") uid = str(uuid.uuid4()) + file_name = uid + ".wav" request_dict = await request.json() audio_b64_str = request_dict.pop("audio") - audio = AudioSegment.from_file(BytesIO(base64.b64decode(audio_b64_str))) + with open(file_name, "wb") as f: + f.write(base64.b64decode(audio_b64_str)) + audio = AudioSegment.from_file(file_name) audio = audio.set_frame_rate(16000) - # bytes to wav - file_name = uid + ".wav" audio.export(f"{file_name}", format="wav") try: asr_result = asr.audio2text(file_name) From 2ddb0a8abecc40d296156ed2408c1852ed9819c4 Mon Sep 17 00:00:00 2001 From: Spycsh Date: Thu, 20 Jun 2024 07:18:49 -0700 Subject: [PATCH 6/8] add tests --- tests/test_asr_whisper.sh | 54 ++++++++++++++++++++++++++++++++++++++ tests/test_tts_speecht5.sh | 54 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 tests/test_asr_whisper.sh create mode 100644 
tests/test_tts_speecht5.sh diff --git a/tests/test_asr_whisper.sh b/tests/test_asr_whisper.sh new file mode 100644 index 000000000..5e6e4a8c8 --- /dev/null +++ b/tests/test_asr_whisper.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -xe + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker build -t opea/whisper:latest -f comps/asr/whisper/Dockerfile . + docker build -t opea/asr:latest -f comps/asr/Dockerfile . +} + +function start_service() { + unset http_proxy + docker run -d --name="test-comps-asr-whisper" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 7066:7066 --ipc=host opea/whisper:latest + docker run -d --name="test-comps-asr" -e ASR_ENDPOINT=http://$ip_address:7066 -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 9099:9099 --ipc=host opea/asr:latest + sleep 3m +} + +function validate_microservice() { + result=$(http_proxy="" curl http://localhost:9099/v1/audio/transcriptions -XPOST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' -H 'Content-Type: application/json') + if [[ $result == *"you"* ]]; then + echo "Result correct." + else + echo "Result wrong." + exit 1 + fi + +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps-asr*") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main diff --git a/tests/test_tts_speecht5.sh b/tests/test_tts_speecht5.sh new file mode 100644 index 000000000..66af28748 --- /dev/null +++ b/tests/test_tts_speecht5.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -xe + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker build -t opea/speecht5:latest -f comps/tts/speecht5/Dockerfile . + docker build -t opea/tts:latest -f comps/tts/Dockerfile . +} + +function start_service() { + unset http_proxy + docker run -d --name="test-comps-tts-speecht5" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 7055:7055 --ipc=host opea/speecht5:latest + docker run -d --name="test-comps-tts" -e TTS_ENDPOINT=http://$ip_address:7055 -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 9088:9088 --ipc=host opea/tts:latest + sleep 3m +} + +function validate_microservice() { + result=$(http_proxy="" curl http://localhost:9088/v1/audio/speech -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json') + if [[ $result == *"AAA"* ]]; then + echo "Result correct." + else + echo "Result wrong." + exit 1 + fi + +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps-tts*") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main From dcb645c70d21714984d8450128a9816950017ead Mon Sep 17 00:00:00 2001 From: Spycsh Date: Thu, 20 Jun 2024 18:47:00 -0700 Subject: [PATCH 7/8] trigger --- comps/asr/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/asr/README.md b/comps/asr/README.md index e2a5cc858..f5a197be1 100644 --- a/comps/asr/README.md +++ b/comps/asr/README.md @@ -94,7 +94,7 @@ docker run -d -p 9099:9099 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$ ### 2.2.3 Test ```bash -# Use curl/python +# Use curl or python # curl http_proxy="" curl http://localhost:9099/v1/audio/transcriptions -XPOST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' -H 'Content-Type: application/json' From a5d39d03b47716aac1696fdbf59162203c603bbd Mon Sep 17 00:00:00 2001 From: Spycsh Date: Thu, 20 Jun 2024 19:10:43 -0700 Subject: [PATCH 8/8] try --- comps/tts/README.md | 4 +++- comps/tts/speecht5/speecht5_server.py | 5 ----- comps/tts/tts.py | 5 ----- 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/comps/tts/README.md b/comps/tts/README.md index 342933d40..a26e72200 100644 --- a/comps/tts/README.md +++ b/comps/tts/README.md @@ -86,5 +86,7 @@ docker run -p 9088:9088 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$htt ```bash # curl -http_proxy="" curl http://localhost:9088/v1/audio/speech -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json' +curl http://localhost:7055/v1/tts -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json' + +curl http://localhost:9088/v1/audio/speech -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json' ``` diff --git a/comps/tts/speecht5/speecht5_server.py b/comps/tts/speecht5/speecht5_server.py index 
8ddf5a2da..467ee6b44 100644 --- a/comps/tts/speecht5/speecht5_server.py +++ b/comps/tts/speecht5/speecht5_server.py @@ -11,11 +11,6 @@ from speecht5_model import SpeechT5Model from starlette.middleware.cors import CORSMiddleware -"""Test: - -curl http://localhost:7055/v1/tts -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json' -""" - app = FastAPI() tts = None diff --git a/comps/tts/tts.py b/comps/tts/tts.py index f1a017c21..6c6bad232 100644 --- a/comps/tts/tts.py +++ b/comps/tts/tts.py @@ -17,11 +17,6 @@ statistics_dict, ) -"""Test: - -curl http://localhost:9088/v1/audio/speech -XPOST -d '{"text": "Who are you?"}' -H 'Content-Type: application/json' -""" - @register_microservice( name="opea_service@tts",