Commit
Merge pull request #33 from DevXT-LLC/Add-basic-ui
Added a basic Streamlit UI to test inference
Josh-XT authored Mar 19, 2024
2 parents 51e135e + 40f9add commit 5b8ca7e
Showing 4 changed files with 183 additions and 38 deletions.
27 changes: 20 additions & 7 deletions Pipes.py
@@ -73,13 +73,16 @@ def __init__(self):

async def get_response(self, data, completion_type="chat"):
data["local_uri"] = self.local_uri
images_uploaded = False
if "messages" in data:
if isinstance(data["messages"][-1]["content"], list):
messages = data["messages"][-1]["content"]
for message in messages:
if "text" in message:
prompt = message["text"]
for message in messages:
if "image_url" in message:
images_uploaded = True
if "audio_url" in message:
audio_url = (
message["audio_url"]["url"]
@@ -133,12 +136,27 @@ async def get_response(self, data, completion_type="chat"):
if completion_type == "chat"
else data["prompt"]
)
if isinstance(user_message, list):
user_message = prompt
for message in messages:
if "image_url" in message:
if "url" in message["image_url"]:
if not message["image_url"]["url"].startswith("data:"):
user_message += (
"Uploaded Image:"
+ message["image_url"]["url"]
+ "\n"
)
response_text = (
response["choices"][0]["text"]
if completion_type != "chat"
else response["choices"][0]["message"]["content"]
)
img_gen_prompt = f"Users message: {user_message} \nAssistant response: {response_text} \n\n**The assistant is acting as sentiment analysis expert and only responds with a concise YES or NO answer on if the user would like an image as visual or a picture generated. No other explanation is needed!**\nShould an image be created to accompany the assistant response?\nAssistant: "
if "data:" in user_message:
user_message = user_message.replace(
user_message.split("data:")[1].split("'")[0], ""
)
img_gen_prompt = f"Users message: {user_message} \n\n{'The user uploaded an image, one does not need generated unless the user is specifically asking.' if images_uploaded else ''} **The assistant is acting as sentiment analysis expert and only responds with a concise YES or NO answer on if the user would like an image as visual or a picture generated. No other explanation is needed!**\nWould the user potentially like an image generated based on their message?\nAssistant: "
logging.info(f"[IMG] Decision maker prompt: {img_gen_prompt}")
create_img = self.llm.chat(
messages=[{"role": "system", "content": img_gen_prompt}],
@@ -149,12 +167,7 @@ async def get_response(self, data, completion_type="chat"):
create_img = str(create_img["choices"][0]["message"]["content"]).lower()
logging.info(f"[IMG] Decision maker response: {create_img}")
if "yes" in create_img or "es," in create_img:
prompt = (
data["messages"][-1]["content"]
if completion_type == "chat"
else data["prompt"]
)
img_prompt = f"**The assistant is acting as a Stable Diffusion Prompt Generator.**\n\nUsers message: {prompt} \nAssistant response: {response} \n\nImportant rules to follow:\n- Describe subjects in detail, specify image type (e.g., digital illustration), art style (e.g., steampunk), and background. Include art inspirations (e.g., Art Station, specific artists). Detail lighting, camera (type, lens, view), and render (resolution, style). The weight of a keyword can be adjusted by using the syntax (((keyword))) , put only those keyword inside ((())) which is very important because it will have more impact so anything wrong will result in unwanted picture so be careful. Realistic prompts: exclude artist, specify lens. Separate with double lines. Max 60 words, avoiding 'real' for fantastical.\n- Based on the message from the user and response of the assistant, you will need to generate one detailed stable diffusion image generation prompt based on the context of the conversation to accompany the assistant response.\n- The prompt can only be up to 60 words long, so try to be concise while using enough descriptive words to make a proper prompt.\n- Following all rules will result in a $2000 tip that you can spend on anything!\n- Must be in markdown code block to be parsed out and only provide prompt in the code block, nothing else.\nStable Diffusion Prompt Generator: "
img_prompt = f"**The assistant is acting as a Stable Diffusion Prompt Generator.**\n\nUsers message: {user_message} \nAssistant response: {response_text} \n\nImportant rules to follow:\n- Describe subjects in detail, specify image type (e.g., digital illustration), art style (e.g., steampunk), and background. Include art inspirations (e.g., Art Station, specific artists). Detail lighting, camera (type, lens, view), and render (resolution, style). The weight of a keyword can be adjusted by using the syntax (((keyword))) , put only those keyword inside ((())) which is very important because it will have more impact so anything wrong will result in unwanted picture so be careful. Realistic prompts: exclude artist, specify lens. Separate with double lines. Max 60 words, avoiding 'real' for fantastical.\n- Based on the message from the user and response of the assistant, you will need to generate one detailed stable diffusion image generation prompt based on the context of the conversation to accompany the assistant response.\n- The prompt can only be up to 60 words long, so try to be concise while using enough descriptive words to make a proper prompt.\n- Following all rules will result in a $2000 tip that you can spend on anything!\n- Must be in markdown code block to be parsed out and only provide prompt in the code block, nothing else.\nStable Diffusion Prompt Generator: "
image_generation_prompt = self.llm.chat(
messages=[{"role": "system", "content": img_prompt}],
max_tokens=100,
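
Note: the decision-maker change above reduces to a reusable pattern: ask the LLM a closed YES/NO question about the conversation, then branch on its answer. A minimal sketch of the pattern as a standalone function, assuming a hypothetical OpenAI-style `chat` callable standing in for `self.llm.chat` (the helper name, `max_tokens` value, and response shape are illustrative, not a pinned API):

    # Sketch: using the LLM as a YES/NO classifier for "should an image be generated?"
    # `chat` is a stand-in for self.llm.chat in Pipes.py; max_tokens=10 is a guess.
    def should_generate_image(chat, user_message: str, images_uploaded: bool) -> bool:
        hint = (
            "The user uploaded an image, one does not need generated unless "
            "the user is specifically asking."
            if images_uploaded
            else ""
        )
        prompt = (
            f"Users message: {user_message} \n\n{hint} **The assistant is acting as "
            "sentiment analysis expert and only responds with a concise YES or NO "
            "answer on if the user would like an image as visual or a picture "
            "generated. No other explanation is needed!**\nWould the user "
            "potentially like an image generated based on their message?\nAssistant: "
        )
        response = chat(messages=[{"role": "system", "content": prompt}], max_tokens=10)
        answer = str(response["choices"][0]["message"]["content"]).lower()
        # The extra "es," check mirrors the condition in the diff.
        return "yes" in answer or "es," in answer
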
51 changes: 21 additions & 30 deletions ezlocalai/VLM.py
@@ -10,6 +10,7 @@
import uuid
import tiktoken
import os
import base64


def get_tokens(text: str) -> int:
@@ -40,30 +41,20 @@ def __init__(self, model="deepseek-ai/deepseek-vl-1.3b-chat"):
def chat(self, messages, **kwargs):
pil_images = []
images = []
conversation = []
prompt = ""
for message in messages:
if isinstance(message["content"], str):
role = message["role"] if "role" in message else "User"
if role.lower() == "user":
role = "User"
conversation.append(
{
"role": role,
"content": message["content"],
}
)
prompt += f"{message['content']}\n\n"
if role.lower() == "system":
prompt = f"System: {message['content']}\n\nUser: {prompt}"
if isinstance(message["content"], list):
for msg in message["content"]:
if "text" in msg:
role = message["role"] if "role" in message else "User"
if role.lower() == "user":
role = "User"
conversation.append(
{
"role": role,
"content": "<image_placeholder>" + msg["text"],
}
)
prompt += f"{msg['text']}\n\n"
if "image_url" in msg:
url = (
msg["image_url"]["url"]
@@ -73,25 +64,25 @@ def chat(self, messages, **kwargs):
image_path = f"./outputs/{uuid.uuid4().hex}.jpg"
if url.startswith("http"):
image = requests.get(url).content
with open(image_path, "wb") as f:
f.write(image)
images.append(image_path)
else:
with open(image_path, "wb") as f:
f.write(url)
images.append(image_path)
file_type = url.split(",")[0].split("/")[1].split(";")[0]
if file_type == "jpeg":
file_type = "jpg"
image_path = f"./outputs/{uuid.uuid4().hex}.{file_type}"
image = base64.b64decode(url.split(",")[1])
with open(image_path, "wb") as f:
f.write(image)
images.append(image_path)
pil_img = PIL.Image.open(image_path)
pil_img = pil_img.convert("RGB")
pil_images.append(pil_img)
if conversation == []:
conversation.append(
{
"role": "User",
"content": messages[0]["content"],
}
)
conversation[0]["images"] = images
conversation.append({"role": "Assistant", "content": ""})
if len(images) > 0:
for image in images:
prompt = f"<image_placeholder> {prompt}"
conversation = [
{"role": "User", "content": prompt, "images": images},
{"role": "Assistant", "content": ""},
]
prepare_inputs = self.vl_chat_processor(
conversations=conversation, images=pil_images, force_batchify=True
).to(self.vl_gpt.device)
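
Note: the rewritten image handling in VLM.py now distinguishes remote URLs from base64 data URLs: remote images are fetched with requests, while data URLs get their file type parsed out of the MIME header and their payload base64-decoded before being written to disk (the old code wrote the raw URL string as the file contents). A standalone sketch of the data-URL branch, assuming a well-formed "data:image/<type>;base64,<payload>" input (the helper name is illustrative):

    import base64
    import os
    import uuid

    def save_data_url(url: str, out_dir: str = "./outputs") -> str:
        """Decode a base64 data URL to a file and return its path."""
        os.makedirs(out_dir, exist_ok=True)
        # "data:image/jpeg;base64,<payload>" -> "jpeg" -> "jpg"
        file_type = url.split(",")[0].split("/")[1].split(";")[0]
        if file_type == "jpeg":
            file_type = "jpg"
        image_path = f"{out_dir}/{uuid.uuid4().hex}.{file_type}"
        with open(image_path, "wb") as f:
            f.write(base64.b64decode(url.split(",")[1]))
        return image_path
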
2 changes: 1 addition & 1 deletion setup.py
@@ -10,7 +10,7 @@

setup(
name="ezlocalai",
version="0.1.6",
version="0.1.7",
description="ezlocalai is an easy to set up local artificial intelligence server with OpenAI Style Endpoints.",
long_description=long_description,
long_description_content_type="text/markdown",
141 changes: 141 additions & 0 deletions ui.py
@@ -0,0 +1,141 @@
import streamlit as st
import openai
import requests
import time
import base64
import os
import re
from datetime import datetime
from dotenv import load_dotenv

load_dotenv()
st.title("ezLocalai")

EZLOCALAI_SERVER = os.getenv("EZLOCALAI_URL", "http://localhost:8091")
EZLOCALAI_API_KEY = os.getenv("EZLOCALAI_API_KEY", "none")
DEFAULT_LLM = os.getenv("DEFAULT_LLM", "phi-2-dpo")
openai.base_url = f"{EZLOCALAI_SERVER}/v1/"
openai.api_key = EZLOCALAI_API_KEY if EZLOCALAI_API_KEY else EZLOCALAI_SERVER
HEADERS = {
"Content-Type": "application/json",
"Authorization": f"{EZLOCALAI_API_KEY}",
"ngrok-skip-browser-warning": "true",
}


def get_voices():
global EZLOCALAI_SERVER
global HEADERS
voices = requests.get(f"{EZLOCALAI_SERVER}/v1/audio/voices", headers=HEADERS)
return voices.json()


waiting_for_server = False

while True:
try:
voices = get_voices()
break
except:
if waiting_for_server == False:
st.spinner("Waiting for server to start...")
waiting_for_server = True
time.sleep(1)
waiting_for_server = False


def display_content(content):
global EZLOCALAI_SERVER
global HEADERS
outputs_url = f"{EZLOCALAI_SERVER}/outputs/"
os.makedirs("outputs", exist_ok=True)
if "http://localhost:8091/outputs/" in content:
if outputs_url != "http://localhost:8091/outputs/":
content = content.replace("http://localhost:8091/outputs/", outputs_url)
if "<audio controls>" in content or " " not in content:
try:
audio_response = content.split("data:audio/wav;base64,")[1].split('" type')[
0
]
except:
audio_response = content
file_name = f"outputs/{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.wav"
with open(file_name, "wb") as fh:
fh.write(base64.b64decode(audio_response))
st.audio(file_name, format="audio/wav", start_time=0)
if outputs_url in content:
urls = re.findall(f"{re.escape(outputs_url)}[^\"' ]+", content)
urls = urls[0].split("\n\n")
for url in urls:
file_name = url.split("/")[-1]
url = f"{outputs_url}{file_name}"
data = requests.get(url, headers=HEADERS).content
if url.endswith(".jpg") or url.endswith(".png"):
content = content.replace(url, "")
st.image(data, use_column_width=True)
elif url.endswith(".mp4"):
content = content.replace(url, "")
st.audio(data, format="audio/mp4", start_time=0)
elif url.endswith(".wav"):
content = content.replace(url, "")
st.audio(data, format="audio/wav", start_time=0)
st.markdown(content, unsafe_allow_html=True)


with st.form("chat"):
SYSTEM_MESSAGE = st.text_area(
"System Prompt",
"The assistant is acting as a creative writer. All of your text responses are transcribed to audio and sent to the user. Be concise with all responses. After the request is fulfilled, end with </s>.",
)
DEFAULT_MAX_TOKENS = st.number_input(
"Max Output Tokens", min_value=10, max_value=300000, value=256
)
DEFAULT_TEMPERATURE = st.number_input(
"Temperature", min_value=0.0, max_value=1.0, value=0.5
)
DEFAULT_TOP_P = st.number_input("Top P", min_value=0.0, max_value=1.0, value=0.9)
voice_drop_down = st.selectbox(
"Text-to-Speech Response Voice", ["None"] + voices["voices"], index=0
)
uploaded_file = st.file_uploader("Upload an image")
prompt = st.text_area("Your Message:", "Describe each stage of this image.")
send = st.form_submit_button("Send")
if prompt != "" and send:
st.markdown("---")
st.spinner("Thinking...")
messages = []
if SYSTEM_MESSAGE != "":
messages.append({"role": "system", "content": SYSTEM_MESSAGE})
if uploaded_file:
messages.append(
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": f"{uploaded_file.type.split('/')[0]}_url",
f"{uploaded_file.type.split('/')[0]}_url": {
"url": f"data:{uploaded_file.type};base64,{base64.b64encode(uploaded_file.read()).decode('utf-8')}",
},
},
],
},
)
if uploaded_file.type.startswith("image"):
st.image(uploaded_file, use_column_width=True)
if messages == []:
messages = [
{"role": "user", "content": prompt},
]
extra_body = {} if voice_drop_down == "None" else {"voice": voice_drop_down}
response = openai.chat.completions.create(
model=DEFAULT_LLM,
messages=messages,
temperature=DEFAULT_TEMPERATURE,
max_tokens=DEFAULT_MAX_TOKENS,
top_p=DEFAULT_TOP_P,
stream=False,
extra_body=extra_body,
)
display_content(response.choices[0].message.content)
st.balloons()
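
Note: ui.py also documents the request shape the server expects for multimodal input: plain text goes in a "text" content part, and an upload becomes an "image_url" (or "audio_url") part whose "url" field is a base64 data URL. A minimal non-Streamlit sketch of the same request, assuming a server running with the defaults above; the image path is a placeholder:

    import base64
    import openai

    openai.base_url = "http://localhost:8091/v1/"
    openai.api_key = "none"

    with open("example.jpg", "rb") as f:  # placeholder local image
        encoded = base64.b64encode(f.read()).decode("utf-8")

    response = openai.chat.completions.create(
        model="phi-2-dpo",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe each stage of this image."},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
                    },
                ],
            }
        ],
        max_tokens=256,
    )
    print(response.choices[0].message.content)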
