diff --git a/Pipes.py b/Pipes.py
index f4c4ca0..1cb6968 100644
--- a/Pipes.py
+++ b/Pipes.py
@@ -73,6 +73,7 @@ def __init__(self):
 
     async def get_response(self, data, completion_type="chat"):
         data["local_uri"] = self.local_uri
+        images_uploaded = False
         if "messages" in data:
             if isinstance(data["messages"][-1]["content"], list):
                 messages = data["messages"][-1]["content"]
@@ -80,6 +81,8 @@ async def get_response(self, data, completion_type="chat"):
                     if "text" in message:
                         prompt = message["text"]
                 for message in messages:
+                    if "image_url" in message:
+                        images_uploaded = True
                     if "audio_url" in message:
                         audio_url = (
                             message["audio_url"]["url"]
@@ -133,12 +136,27 @@ async def get_response(self, data, completion_type="chat"):
                 if completion_type == "chat"
                 else data["prompt"]
             )
+            if isinstance(user_message, list):
+                user_message = prompt
+                for message in messages:
+                    if "image_url" in message:
+                        if "url" in message["image_url"]:
+                            if not message["image_url"]["url"].startswith("data:"):
+                                user_message += (
+                                    "Uploaded Image:"
+                                    + message["image_url"]["url"]
+                                    + "\n"
+                                )
             response_text = (
                 response["choices"][0]["text"]
                 if completion_type != "chat"
                 else response["choices"][0]["message"]["content"]
             )
-            img_gen_prompt = f"Users message: {user_message} \nAssistant response: {response_text} \n\n**The assistant is acting as sentiment analysis expert and only responds with a concise YES or NO answer on if the user would like an image as visual or a picture generated. No other explanation is needed!**\nShould an image be created to accompany the assistant response?\nAssistant: "
+            if "data:" in user_message:
+                user_message = user_message.replace(
+                    user_message.split("data:")[1].split("'")[0], ""
+                )
+            img_gen_prompt = f"Users message: {user_message} \n\n{'The user uploaded an image, one does not need generated unless the user is specifically asking.' if images_uploaded else ''} **The assistant is acting as sentiment analysis expert and only responds with a concise YES or NO answer on if the user would like an image as visual or a picture generated. No other explanation is needed!**\nWould the user potentially like an image generated based on their message?\nAssistant: "
             logging.info(f"[IMG] Decision maker prompt: {img_gen_prompt}")
             create_img = self.llm.chat(
                 messages=[{"role": "system", "content": img_gen_prompt}],
@@ -149,12 +167,7 @@ async def get_response(self, data, completion_type="chat"):
             create_img = str(create_img["choices"][0]["message"]["content"]).lower()
             logging.info(f"[IMG] Decision maker response: {create_img}")
             if "yes" in create_img or "es," in create_img:
-                prompt = (
-                    data["messages"][-1]["content"]
-                    if completion_type == "chat"
-                    else data["prompt"]
-                )
-                img_prompt = f"**The assistant is acting as a Stable Diffusion Prompt Generator.**\n\nUsers message: {prompt} \nAssistant response: {response} \n\nImportant rules to follow:\n- Describe subjects in detail, specify image type (e.g., digital illustration), art style (e.g., steampunk), and background. Include art inspirations (e.g., Art Station, specific artists). Detail lighting, camera (type, lens, view), and render (resolution, style). The weight of a keyword can be adjusted by using the syntax (((keyword))) , put only those keyword inside ((())) which is very important because it will have more impact so anything wrong will result in unwanted picture so be careful. Realistic prompts: exclude artist, specify lens. Separate with double lines. Max 60 words, avoiding 'real' for fantastical.\n- Based on the message from the user and response of the assistant, you will need to generate one detailed stable diffusion image generation prompt based on the context of the conversation to accompany the assistant response.\n- The prompt can only be up to 60 words long, so try to be concise while using enough descriptive words to make a proper prompt.\n- Following all rules will result in a $2000 tip that you can spend on anything!\n- Must be in markdown code block to be parsed out and only provide prompt in the code block, nothing else.\nStable Diffusion Prompt Generator: "
+                img_prompt = f"**The assistant is acting as a Stable Diffusion Prompt Generator.**\n\nUsers message: {user_message} \nAssistant response: {response_text} \n\nImportant rules to follow:\n- Describe subjects in detail, specify image type (e.g., digital illustration), art style (e.g., steampunk), and background. Include art inspirations (e.g., Art Station, specific artists). Detail lighting, camera (type, lens, view), and render (resolution, style). The weight of a keyword can be adjusted by using the syntax (((keyword))) , put only those keyword inside ((())) which is very important because it will have more impact so anything wrong will result in unwanted picture so be careful. Realistic prompts: exclude artist, specify lens. Separate with double lines. Max 60 words, avoiding 'real' for fantastical.\n- Based on the message from the user and response of the assistant, you will need to generate one detailed stable diffusion image generation prompt based on the context of the conversation to accompany the assistant response.\n- The prompt can only be up to 60 words long, so try to be concise while using enough descriptive words to make a proper prompt.\n- Following all rules will result in a $2000 tip that you can spend on anything!\n- Must be in markdown code block to be parsed out and only provide prompt in the code block, nothing else.\nStable Diffusion Prompt Generator: "
                 image_generation_prompt = self.llm.chat(
                     messages=[{"role": "system", "content": img_prompt}],
                     max_tokens=100,
diff --git a/ezlocalai/VLM.py b/ezlocalai/VLM.py
index 5b3ec1c..6dd50b1 100644
--- a/ezlocalai/VLM.py
+++ b/ezlocalai/VLM.py
@@ -10,6 +10,7 @@
 import uuid
 import tiktoken
 import os
+import base64
 
 
 def get_tokens(text: str) -> int:
@@ -40,30 +41,20 @@ def __init__(self, model="deepseek-ai/deepseek-vl-1.3b-chat"):
     def chat(self, messages, **kwargs):
         pil_images = []
         images = []
-        conversation = []
+        prompt = ""
         for message in messages:
             if isinstance(message["content"], str):
                 role = message["role"] if "role" in message else "User"
                 if role.lower() == "user":
-                    role = "User"
-                    conversation.append(
-                        {
-                            "role": role,
-                            "content": message["content"],
-                        }
-                    )
+                    prompt += f"{message['content']}\n\n"
+                if role.lower() == "system":
+                    prompt = f"System: {message['content']}\n\nUser: {prompt}"
             if isinstance(message["content"], list):
                 for msg in message["content"]:
                     if "text" in msg:
                         role = message["role"] if "role" in message else "User"
                         if role.lower() == "user":
-                            role = "User"
-                            conversation.append(
-                                {
-                                    "role": role,
-                                    "content": "" + msg["text"],
-                                }
-                            )
+                            prompt += f"{msg['text']}\n\n"
                     if "image_url" in msg:
                         url = (
                             msg["image_url"]["url"]
@@ -73,25 +64,25 @@ def chat(self, messages, **kwargs):
                         image_path = f"./outputs/{uuid.uuid4().hex}.jpg"
                         if url.startswith("http"):
                             image = requests.get(url).content
-                            with open(image_path, "wb") as f:
-                                f.write(image)
-                            images.append(image_path)
                         else:
-                            with open(image_path, "wb") as f:
-                                f.write(url)
-                            images.append(image_path)
+                            file_type = url.split(",")[0].split("/")[1].split(";")[0]
+                            if file_type == "jpeg":
+                                file_type = "jpg"
+                            image_path = f"./outputs/{uuid.uuid4().hex}.{file_type}"
+                            image = base64.b64decode(url.split(",")[1])
+                        with open(image_path, "wb") as f:
+                            f.write(image)
+                        images.append(image_path)
                         pil_img = PIL.Image.open(image_path)
                         pil_img = pil_img.convert("RGB")
                         pil_images.append(pil_img)
-        if conversation == []:
-            conversation.append(
-                {
-                    "role": "User",
-                    "content": messages[0]["content"],
-                }
-            )
-        conversation[0]["images"] = images
-        conversation.append({"role": "Assistant", "content": ""})
+        if len(images) > 0:
+            for image in images:
+                prompt = f"<image_placeholder> {prompt}"
+        conversation = [
+            {"role": "User", "content": prompt, "images": images},
+            {"role": "Assistant", "content": ""},
+        ]
         prepare_inputs = self.vl_chat_processor(
             conversations=conversation, images=pil_images, force_batchify=True
         ).to(self.vl_gpt.device)
diff --git a/setup.py b/setup.py
index 19fccb5..32b93e9 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
 
 setup(
     name="ezlocalai",
-    version="0.1.6",
+    version="0.1.7",
     description="ezlocalai is an easy to set up local artificial intelligence server with OpenAI Style Endpoints.",
     long_description=long_description,
     long_description_content_type="text/markdown",
diff --git a/ui.py b/ui.py
new file mode 100644
index 0000000..ba9a517
--- /dev/null
+++ b/ui.py
@@ -0,0 +1,141 @@
+import streamlit as st
+import openai
+import requests
+import time
+import base64
+import os
+import re
+from datetime import datetime
+from dotenv import load_dotenv
+
+load_dotenv()
+st.title("ezLocalai")
+
+EZLOCALAI_SERVER = os.getenv("EZLOCALAI_URL", "http://localhost:8091")
+EZLOCALAI_API_KEY = os.getenv("EZLOCALAI_API_KEY", "none")
+DEFAULT_LLM = os.getenv("DEFAULT_LLM", "phi-2-dpo")
+openai.base_url = f"{EZLOCALAI_SERVER}/v1/"
+openai.api_key = EZLOCALAI_API_KEY if EZLOCALAI_API_KEY else EZLOCALAI_SERVER
+HEADERS = {
+    "Content-Type": "application/json",
+    "Authorization": f"{EZLOCALAI_API_KEY}",
+    "ngrok-skip-browser-warning": "true",
+}
+
+
+def get_voices():
+    global EZLOCALAI_SERVER
+    global HEADERS
+    voices = requests.get(f"{EZLOCALAI_SERVER}/v1/audio/voices", headers=HEADERS)
+    return voices.json()
+
+
+waiting_for_server = False
+
+while True:
+    try:
+        voices = get_voices()
+        break
+    except:
+        if waiting_for_server == False:
+            st.spinner("Waiting for server to start...")
+            waiting_for_server = True
+        time.sleep(1)
+waiting_for_server = False
+
+
+def display_content(content):
+    global EZLOCALAI_SERVER
+    global HEADERS
+    outputs_url = f"{EZLOCALAI_SERVER}/outputs/"
+    os.makedirs("outputs", exist_ok=True)
+    if "http://localhost:8091/outputs/" in content:
+        if outputs_url != "http://localhost:8091/outputs/":
+            content = content.replace("http://localhost:8091/outputs/", outputs_url)
+    if "