# Pipes.py

import os
import logging
from dotenv import load_dotenv
from ezlocalai.LLM import LLM, is_vision_model
from ezlocalai.STT import STT
from ezlocalai.CTTS import CTTS
from ezlocalai.Embedding import Embedding
from pyngrok import ngrok
import requests
import base64
import pdfplumber
import torch
from Globals import getenv

# Image generation support is optional; fall back gracefully if IMG cannot be imported.
try:
    from ezlocalai.IMG import IMG

    img_import_success = True
except ImportError:
    img_import_success = False
from ezlocalai.VLM import VLM


class Pipes:
    """Loads whichever ezlocalai backends are enabled (LLM, VLM, STT, CTTS,
    Embedding, and image generation) and routes chat, audio, and image
    requests through them."""

    def __init__(self):
        load_dotenv()
        global img_import_success
        self.current_llm = getenv("DEFAULT_MODEL")
        self.current_vlm = getenv("VISION_MODEL")
        self.llm = None
        self.vlm = None
        self.ctts = None
        self.stt = None
        self.embedder = None
        if self.current_llm.lower() != "none":
            logging.info(f"[LLM] {self.current_llm} model loading. Please wait...")
            self.llm = LLM(model=self.current_llm)
            logging.info(f"[LLM] {self.current_llm} model loaded successfully.")
        if getenv("EMBEDDING_ENABLED").lower() == "true":
            self.embedder = Embedding()
        if self.current_vlm != "":
            logging.info(f"[VLM] {self.current_vlm} model loading. Please wait...")
            try:
                self.vlm = VLM(model=self.current_vlm)
            except Exception as e:
                logging.error(f"[VLM] Failed to load the model: {e}")
                self.vlm = None
        if self.vlm is not None:
            logging.info(f"[ezlocalai] Vision is enabled with {self.current_vlm}.")
        if getenv("TTS_ENABLED").lower() == "true":
            logging.info(f"[CTTS] xttsv2_2.0.2 model loading. Please wait...")
            self.ctts = CTTS()
            logging.info(f"[CTTS] xttsv2_2.0.2 model loaded successfully.")
        if getenv("STT_ENABLED").lower() == "true":
            self.current_stt = getenv("WHISPER_MODEL")
            logging.info(f"[STT] {self.current_stt} model loading. Please wait...")
            self.stt = STT(model=self.current_stt)
            logging.info(f"[STT] {self.current_stt} model loaded successfully.")
        if is_vision_model(self.current_llm):
            if self.vlm is None:
                self.vlm = self.llm
        if self.current_llm == "none" and self.vlm is not None:
            self.llm = self.vlm
        NGROK_TOKEN = getenv("NGROK_TOKEN")
        if NGROK_TOKEN:
            ngrok.set_auth_token(NGROK_TOKEN)
            public_url = ngrok.connect(8091)
            logging.info(f"[ngrok] Public Tunnel: {public_url.public_url}")
            self.local_uri = public_url.public_url
        else:
            self.local_uri = getenv("EZLOCALAI_URL")
        self.img_enabled = getenv("IMG_ENABLED").lower() == "true"
        self.img = None
        if img_import_success:
            logging.info(f"[IMG] Image generation is enabled.")
            SD_MODEL = getenv("SD_MODEL")  # stabilityai/sdxl-turbo
            if SD_MODEL:
                logging.info(f"[IMG] {SD_MODEL} model loading. Please wait...")
                img_device = getenv("IMG_DEVICE")
                try:
                    self.img = IMG(
                        model=SD_MODEL, local_uri=self.local_uri, device=img_device
                    )
                except Exception as e:
                    logging.error(f"[IMG] Failed to load the model: {e}")
                    self.img = None
                logging.info(f"[IMG] {SD_MODEL} model loaded successfully.")

    async def pdf_to_audio(self, title, voice, pdf, chunk_size=200):
        """Decode a base64-encoded PDF, extract its text with pdfplumber, and
        synthesize the text to speech with CTTS."""
        filename = f"{title}.pdf"
        file_path = os.path.join(os.getcwd(), "outputs", filename)
        pdf = pdf.split(",")[1]
        pdf = base64.b64decode(pdf)
        with open(file_path, "wb") as pdf_file:
            pdf_file.write(pdf)
        content = ""
        if file_path.endswith(".pdf"):
            with pdfplumber.open(file_path) as pdf:
                content = "\n".join([page.extract_text() for page in pdf.pages])
        if not content:
            return
        return await self.ctts.generate(
            text=content,
            voice=voice,
            local_uri=self.local_uri,
            output_file_name=f"{title}.wav",
        )

    async def audio_to_audio(self, voice, audio):
        """Transcribe base64-encoded audio with STT, then speak the
        transcription back in the requested voice with CTTS."""
        audio_type = audio.split(",")[0].split(":")[1].split(";")[0]
        audio_format = audio_type.split("/")[1]
        audio = audio.split(",")[1]
        audio = base64.b64decode(audio)
        text = self.stt.transcribe_audio(base64_audio=audio, audio_format=audio_format)
        return await self.ctts.generate(
            text=text, voice=voice, local_uri=self.local_uri
        )

    async def generate_image(self, prompt, response_format="url", size="512x512"):
        """Generate an image for the prompt with the loaded image model;
        returns an empty string if image generation is unavailable."""
        if self.img:
            self.img.local_uri = self.local_uri if response_format == "url" else None
            new_image = self.img.generate(
                prompt=prompt,
                size=size,
            )
            self.img.local_uri = self.local_uri
            return new_image
        return ""

    async def get_response(self, data, completion_type="chat"):
        """Handle a chat or completion request: transcribe attached audio,
        describe attached images with the VLM, run the LLM, and optionally
        append synthesized speech and a generated image to the response."""
        data["local_uri"] = self.local_uri
        images = []
        if "messages" in data:
            if isinstance(data["messages"][-1]["content"], list):
                messages = data["messages"][-1]["content"]
                for message in messages:
                    if "text" in message:
                        prompt = message["text"]
                for message in messages:
                    if "image_url" in message:
                        images.append(message)
                    if "audio_url" in message:
                        audio_url = (
                            message["audio_url"]["url"]
                            if "url" in message["audio_url"]
                            else message["audio_url"]
                        )
                        audio_format = "wav"
                        if audio_url.startswith("data:"):
                            # Read the audio format from the data URI header
                            # (e.g. "data:audio/wav;base64,...") before
                            # stripping the header off.
                            audio_format = audio_url.split(";")[0].split("/")[-1]
                            audio_url = audio_url.split(",")[1]
                        else:
                            audio_url = requests.get(audio_url).content
                            audio_url = base64.b64encode(audio_url).decode("utf-8")
                        transcribed_audio = self.stt.transcribe_audio(
                            base64_audio=audio_url, audio_format=audio_format
                        )
                        prompt = f"Transcribed Audio: {transcribed_audio}\n\n{prompt}"
        # Always answer with the locally loaded model, regardless of the
        # model name in the request.
        if data["model"]:
            if self.current_llm != data["model"]:
                data["model"] = self.current_llm
        if "stop" in data:
            new_stop = self.llm.params["stop"]
            new_stop.append(data["stop"])
            data["stop"] = new_stop
        if "audio_format" in data:
            base64_audio = (
                data["messages"][-1]["content"]
                if completion_type == "chat"
                else data["prompt"]
            )
            prompt = await self.stt.transcribe_audio(
                base64_audio=base64_audio,
                audio_format=data["audio_format"],
            )
            if completion_type == "chat":
                data["messages"][-1]["content"] = prompt
            else:
                data["prompt"] = prompt
        user_message = (
            data["messages"][-1]["content"]
            if completion_type == "chat"
            else data["prompt"]
        )
        # If images were attached and a vision model is available, describe
        # them and fold the description into the prompt.
        if self.vlm and images:
            new_messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Describe each stage of this image.",
                        },
                    ],
                }
            ]
            new_messages[0]["content"].extend(images)
            image_description = self.vlm.chat(messages=new_messages)
            print(
                f"Image Description: {image_description['choices'][0]['message']['content']}"
            )
            prompt = (
                f"\n\nSee the uploaded image description for any questions about the uploaded image. Act as if you can see the image based on the description. Do not mention 'uploaded image description' in response. Uploaded Image Description: {image_description['choices'][0]['message']['content']}\n\n{data['messages'][-1]['content'][0]['text']}"
                if completion_type == "chat"
                else f"\n\nSee the uploaded image description for any questions about the uploaded image. Act as if you can see the image based on the description. Do not mention 'uploaded image description' in response. Uploaded Image Description: {image_description['choices'][0]['message']['content']}\n\n{data['prompt']}"
            )
            print(f"Full Prompt: {prompt}")
            if completion_type == "chat":
                data["messages"][-1]["content"] = prompt
            else:
                data["prompt"] = prompt
        if completion_type == "chat":
            response = self.llm.chat(**data)
        else:
            response = self.llm.completion(**data)
        generated_image = None
        if "temperature" not in data:
            data["temperature"] = 0.5
        if "top_p" not in data:
            data["top_p"] = 0.9
        # Optionally generate an image: first ask the LLM whether the user
        # seems to want one, then have it write a Stable Diffusion prompt.
        if self.img_enabled and img_import_success and self.img:
            user_message = (
                data["messages"][-1]["content"]
                if completion_type == "chat"
                else data["prompt"]
            )
            if isinstance(user_message, list):
                user_message = prompt
                for message in messages:
                    if "image_url" in message:
                        if "url" in message["image_url"]:
                            if not message["image_url"]["url"].startswith("data:"):
                                user_message += (
                                    "Uploaded Image:"
                                    + message["image_url"]["url"]
                                    + "\n"
                                )
            response_text = (
                response["choices"][0]["text"]
                if completion_type != "chat"
                else response["choices"][0]["message"]["content"]
            )
            if "data:" in user_message:
                user_message = user_message.replace(
                    user_message.split("data:")[1].split("'")[0], ""
                )
            img_gen_prompt = f"Users message: {user_message} \n\n{'The user uploaded an image, one does not need generated unless the user is specifically asking.' if images else ''} **The assistant is acting as sentiment analysis expert and only responds with a concise YES or NO answer on if the user would like an image as visual or a picture generated. No other explanation is needed!**\nWould the user potentially like an image generated based on their message?\nAssistant: "
            logging.info(f"[IMG] Decision maker prompt: {img_gen_prompt}")
            create_img = self.llm.chat(
                messages=[{"role": "system", "content": img_gen_prompt}],
                max_tokens=10,
                temperature=data["temperature"],
                top_p=data["top_p"],
            )
            create_img = str(create_img["choices"][0]["message"]["content"]).lower()
            logging.info(f"[IMG] Decision maker response: {create_img}")
            if "yes" in create_img or "es," in create_img:
                img_prompt = f"**The assistant is acting as a Stable Diffusion Prompt Generator.**\n\nUsers message: {user_message} \nAssistant response: {response_text} \n\nImportant rules to follow:\n- Describe subjects in detail, specify image type (e.g., digital illustration), art style (e.g., steampunk), and background. Include art inspirations (e.g., Art Station, specific artists). Detail lighting, camera (type, lens, view), and render (resolution, style). The weight of a keyword can be adjusted by using the syntax (((keyword))) , put only those keyword inside ((())) which is very important because it will have more impact so anything wrong will result in unwanted picture so be careful. Realistic prompts: exclude artist, specify lens. Separate with double lines. Max 60 words, avoiding 'real' for fantastical.\n- Based on the message from the user and response of the assistant, you will need to generate one detailed stable diffusion image generation prompt based on the context of the conversation to accompany the assistant response.\n- The prompt can only be up to 60 words long, so try to be concise while using enough descriptive words to make a proper prompt.\n- Following all rules will result in a $2000 tip that you can spend on anything!\n- Must be in markdown code block to be parsed out and only provide prompt in the code block, nothing else.\nStable Diffusion Prompt Generator: "
                image_generation_prompt = self.llm.chat(
                    messages=[{"role": "system", "content": img_prompt}],
                    max_tokens=100,
                    temperature=data["temperature"],
                    top_p=data["top_p"],
                )
                image_generation_prompt = str(
                    image_generation_prompt["choices"][0]["message"]["content"]
                )
                logging.info(
                    f"[IMG] Image generation response: {image_generation_prompt}"
                )
                if "```markdown" in image_generation_prompt:
                    image_generation_prompt = image_generation_prompt.split(
                        "```markdown"
                    )[1]
                    image_generation_prompt = image_generation_prompt.split("```")[0]
                generated_image = self.img.generate(prompt=image_generation_prompt)
        audio_response = None
        # If a voice was requested, speak the text response with CTTS and
        # append the audio to the returned content.
        if "voice" in data:
            text_response = (
                response["choices"][0]["text"]
                if completion_type != "chat"
                else response["choices"][0]["message"]["content"]
            )
            language = data["language"] if "language" in data else "en"
            audio_response = await self.ctts.generate(
                text=text_response,
                voice=data["voice"],
                language=language,
                local_uri=self.local_uri,
            )
            if completion_type != "chat":
                response["choices"][0]["text"] = f"{text_response}\n{audio_response}"
            else:
                response["choices"][0]["message"][
                    "content"
                ] = f"{text_response}\n{audio_response}"
        if generated_image:
            if completion_type != "chat":
                response["choices"][0]["text"] += f"\n\n{generated_image}"
            else:
                response["choices"][0]["message"]["content"] += f"\n\n{generated_image}"
        return response, audio_response
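

# Illustrative usage sketch (not part of the original module). It assumes the
# ezlocalai environment variables (DEFAULT_MODEL, EZLOCALAI_URL, and the
# *_ENABLED flags) are configured and that the required model weights are
# available locally; the request payload below simply mirrors what
# get_response() expects above, and _demo is just a local helper name for
# this sketch.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        pipes = Pipes()  # loads the enabled backends from the environment
        request = {
            "model": pipes.current_llm,
            "messages": [{"role": "user", "content": "Hello, what can you do?"}],
            "temperature": 0.5,
            "top_p": 0.9,
        }
        response, audio_response = await pipes.get_response(
            data=request, completion_type="chat"
        )
        print(response["choices"][0]["message"]["content"])

    asyncio.run(_demo())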