Support bark for TTS #100

Merged · 5 commits · Aug 1, 2024
2 changes: 1 addition & 1 deletion lazyllm/common/globals.py
@@ -66,7 +66,7 @@ def __getattr__(self, __name: str) -> Any:
        raise AttributeError(f'Attr {__name} not found in globals')

    def clear(self):
-        self.__data.pop(self._sid)
+        self.__data.pop(self._sid, None)

    def _clear_all(self):
        self.__data.clear()
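For context (a sketch, not part of the diff): the added default makes clear() tolerant of a session id that was never registered, because dict.pop with a default never raises:

    # Illustration of the behavior the one-line fix above relies on.
    data = {}
    # data.pop('missing-sid')              -> would raise KeyError
    value = data.pop('missing-sid', None)  # returns the default instead of raising
    assert value is None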
3 changes: 3 additions & 0 deletions lazyllm/components/auto/autodeploy.py
@@ -5,6 +5,7 @@
from .auto_helper import model_map, get_model_name, check_requirements
from lazyllm.components.embedding.embed import EmbeddingDeploy
from lazyllm.components.stable_diffusion.stable_diffusion3 import StableDiffusionDeploy
+from lazyllm.components.text_to_speech.bark import BarkDeploy
from ..utils.downloader import ModelManager

class AutoDeploy(LazyLLMDeployBase):
@@ -20,6 +21,8 @@ def __new__(cls, base_model, source=lazyllm.config['model_source'], trust_remote
            return EmbeddingDeploy(trust_remote_code, launcher)
        elif type == 'sd' or ModelManager.get_model_type(model_name) == 'sd':
            return StableDiffusionDeploy(launcher)
+        elif type == 'tts' or ModelManager.get_model_type(model_name) == 'tts':
+            return BarkDeploy(launcher)
        map_name = model_map(model_name)
        candidates = get_configer().query_deploy(lazyllm.config['gpu_type'], launcher.ngpus,
                                                 map_name, max_token_num)
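For illustration (a sketch, not part of the diff): the new branch fires whenever the model name maps to the 'tts' type, which the "bark" entry added to model_mapping.py below provides, so a quick check might look like:

    from lazyllm.components.utils.downloader import ModelManager

    # Assumption: the 'bark' mapping below is installed; get_model_type is the same
    # lookup AutoDeploy uses above to decide on BarkDeploy.
    assert ModelManager.get_model_type('bark') == 'tts'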
Empty file.
70 changes: 70 additions & 0 deletions lazyllm/components/text_to_speech/bark.py
@@ -0,0 +1,70 @@
import os
import json

import lazyllm
from lazyllm import LOG
from lazyllm.thirdparty import torch
from lazyllm.thirdparty import transformers as tf
from ..utils.downloader import ModelManager

class Bark(object):

    def __init__(self, base_sd, source=None, trust_remote_code=True, init=False):
        source = lazyllm.config['model_source'] if not source else source
        self.base_sd = ModelManager(source).download(base_sd)
        self.trust_remote_code = trust_remote_code
        self.processor, self.bark = None, None
        self.init_flag = lazyllm.once_flag()
        self.device = 'cpu'
        if init:
            lazyllm.call_once(self.init_flag, self.load_bark)

    def load_bark(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.processor = tf.AutoProcessor.from_pretrained(self.base_sd)
        self.processor.speaker_embeddings['repo_or_path'] = self.base_sd
        self.bark = tf.BarkModel.from_pretrained(self.base_sd,
                                                 torch_dtype=torch.float16,
                                                 attn_implementation="flash_attention_2").to(self.device)

    def __call__(self, string):
        lazyllm.call_once(self.init_flag, self.load_bark)
        if isinstance(string, str):
            query = string
            voice_preset = "v2/zh_speaker_9"
        elif isinstance(string, dict):
            query = string['inputs']
            voice_preset = string['voice_preset']
        else:
            raise TypeError(f"Not support input type:{type(string)}, requires str or dict.")
        inputs = self.processor(query, voice_preset=voice_preset).to(self.device)
        speech = self.bark.generate(**inputs) * 32767
        res = {'sounds': (
            self.bark.generation_config.sample_rate,
            speech.cpu().numpy().squeeze().tolist()
        )}
        return json.dumps(res)


class BarkDeploy(object):
    keys_name_handle = {
        'inputs': 'inputs',
    }
    message_format = {
        'inputs': 'Who are you ?',
        'voice_preset': None,
    }
    default_headers = {'Content-Type': 'application/json'}

    def __init__(self, launcher=None):
        self.launcher = launcher

    def __call__(self, finetuned_model=None, base_model=None):
        if not os.path.exists(finetuned_model) or \
            not any(filename.endswith('.bin') or filename.endswith('.safetensors')
                    for _, _, filename in os.walk(finetuned_model) if filename):
            if not finetuned_model:
                LOG.warning(f"Note! That finetuned_model({finetuned_model}) is an invalid path, "
                            f"base_model({base_model}) will be used")
            finetuned_model = base_model
        return lazyllm.deploy.RelayServer(func=Bark(finetuned_model), launcher=self.launcher)()
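A minimal usage sketch (assumptions: the 'bark' checkpoint can be downloaded and a GPU with flash-attention is available, since load_bark requests flash_attention_2). Bark.__call__ accepts either a bare string, which falls back to the "v2/zh_speaker_9" preset, or a dict naming 'inputs' and 'voice_preset', and returns a JSON string with a 'sounds' field:

    import json
    from lazyllm.components.text_to_speech.bark import Bark

    tts = Bark('bark')                       # model path resolved via ModelManager
    res = json.loads(tts('Nice to meet you.'))                                   # default preset
    res = json.loads(tts({'inputs': 'Nice to meet you.', 'voice_preset': 'v2/en_speaker_6'}))
    sample_rate, samples = res['sounds']     # int sample rate, list of int16-range samples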
6 changes: 5 additions & 1 deletion lazyllm/components/utils/downloader/model_mapping.py
@@ -285,7 +285,11 @@
"stable-diffusion-3-medium": {
"source": {"huggingface": "stabilityai/stable-diffusion-3-medium", "modelscope": "AI-ModelScope/stable-diffusion-3-medium-diffusers"},
"type": "sd",
},
},
"bark":{
"source": {"huggingface": "suno/bark", "modelscope": "mapjack/bark"},
"type": "tts",
},
"llava-1.5-7b": {
"source": {"huggingface": "llava-hf/llava-1.5-7b-hf", "modelscope": "huangjintao/llava-1.5-7b-hf"},
"type": "vlm",
21 changes: 16 additions & 5 deletions lazyllm/tools/webpages/webmodule.py
@@ -12,6 +12,7 @@
from PIL import Image
from io import BytesIO
from types import GeneratorType
+import numpy as np

import lazyllm
from lazyllm import LOG, globals
@@ -93,6 +94,9 @@ def init_web(self, component_descs):
            for _, gname, name, ctype, value in component_descs:
                if ctype in ('Checkbox', 'Text'):
                    components.append(getattr(gr, ctype)(interactive=True, value=value, label=f'{gname}.{name}'))
+                elif ctype == 'Dropdown':
+                    components.append(getattr(gr, ctype)(interactive=True, choices=value,
+                                                         label=f'{gname}.{name}'))
                else:
                    raise KeyError(f'invalid component type: {ctype}')
            with gr.Row():
@@ -262,7 +266,11 @@ def get_log_and_message(s):
                elif 'images_base64' in r:
                    image_data = r.pop('images_base64')[0]
                    image = Image.open(BytesIO(base64.b64decode(image_data)))
-                    return "The image is: ", "".join(log_history), image
+                    return "The image is: ", "".join(log_history), {'img': image}
+                elif 'sounds' in r:
+                    sound_data = r.pop('sounds')
+                    sound_data = (sound_data[0], np.array(sound_data[1]).astype(np.int16))
+                    return "The Audio is: ", "".join(log_history), {'audio': sound_data}
                else:
                    s = s
            except (ValueError, KeyError, TypeError):
@@ -272,11 +280,14 @@
return s, "".join(log_history), None

log_history = []
image = None
file = None
if isinstance(result, (str, dict)):
result, log, image = get_log_and_message(result)
if image:
chat_history[-1][1] = gr.Image(image)
result, log, file = get_log_and_message(result)
if file:
if 'img' in file:
chat_history[-1][1] = gr.Image(file['img'])
if 'audio' in file:
chat_history[-1][1] = gr.Audio(file['audio'])
elif isinstance(result, str):
chat_history[-1][1] = result
elif isinstance(result, GeneratorType):
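For reference (a sketch, not part of the change): the 'sounds' branch simply reverses Bark's serialization — the JSON carries (sample_rate, samples) and the UI rebuilds an int16 array for gr.Audio. The same decoding outside Gradio:

    import json
    import numpy as np

    # Stand-in for the JSON string Bark.__call__ returns (a real response has many samples).
    bark_response = json.dumps({'sounds': (24000, [0, 1200, -1200, 0])})

    sample_rate, samples = json.loads(bark_response)['sounds']
    audio = np.array(samples).astype(np.int16)   # same conversion as in the diff above
    # (sample_rate, audio) is the tuple handed to gr.Audio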
6 changes: 6 additions & 0 deletions tests/advanced_tests/test_deploy.py
@@ -96,6 +96,12 @@ def test_sd3(self):
        res = m('a little cat')
        assert "images_base64" in json.loads(res)

+    def test_bark(self):
+        m = lazyllm.TrainableModule('bark')
+        m.update_server()
+        res = m('你好啊，很高兴认识你。')
+        assert "sounds" in json.loads(res)
+
    def test_vlm_and_lmdeploy(self):
        chat = lazyllm.TrainableModule('internvl-chat-2b-v1-5').deploy_method(deploy.LMDeploy)
        m = lazyllm.ServerModule(chat)
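As an aside (a sketch mirroring test_bark above, assuming the deployed module is reachable): the returned sounds can be written to a WAV file with the standard library:

    import json
    import wave

    import lazyllm
    import numpy as np

    m = lazyllm.TrainableModule('bark')
    m.update_server()
    sample_rate, samples = json.loads(m('Nice to meet you.'))['sounds']

    with wave.open('bark_out.wav', 'wb') as f:
        f.setnchannels(1)      # Bark output is mono
        f.setsampwidth(2)      # int16 samples
        f.setframerate(sample_rate)
        f.writeframes(np.array(samples, dtype=np.int16).tobytes())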