restore vlm
Josh-XT committed Feb 11, 2025
1 parent 23c0fd8 commit f16e16b
Showing 5 changed files with 234 additions and 10 deletions.
.env (2 additions, 1 deletion)
@@ -3,7 +3,8 @@ MAIN_GPU=0
NGROK_TOKEN=
EZLOCALAI_API_KEY=
EZLOCALAI_URL=http://localhost:8091
-DEFAULT_MODEL=lmstudio-community/Qwen2-VL-7B-Instruct-GGUF
+DEFAULT_MODEL=bartowski/Qwen2.5-3B-Instruct-GGUF
+VISION_MODEL=deepseek-ai/deepseek-vl-1.3b-chat
IMG_ENABLED=false
IMG_DEVICE=cpu
SD_MODEL=stabilityai/sdxl-turbo
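
A minimal sketch of how these settings are consumed downstream; the names come from this .env and the empty-string check mirrors Pipes.__init__ below:

from os import getenv

from dotenv import load_dotenv

load_dotenv()  # read .env from the working directory

llm_name = getenv("DEFAULT_MODEL")  # bartowski/Qwen2.5-3B-Instruct-GGUF
vlm_name = getenv("VISION_MODEL")   # deepseek-ai/deepseek-vl-1.3b-chat

# Leaving VISION_MODEL empty disables the separate vision model.
vision_enabled = vlm_name is not None and vlm_name != ""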
Dockerfile (4 additions)
@@ -8,6 +8,10 @@ WORKDIR /app
COPY requirements.txt .
RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked \
python3 -m pip install --no-cache-dir -r requirements.txt && \
git clone https://github.com/Josh-XT/DeepSeek-VL deepseek && \
cd deepseek && \
pip install --no-cache-dir -e . && \
cd .. && \
pip uninstall -y torch torchvision torchaudio && \
pip install torch==2.3.1 torchaudio==2.3.1 torchvision==0.18.1 --no-cache-dir && \
pip install spacy==3.7.4 && \
Pipes.py (46 additions, 9 deletions)
@@ -19,13 +19,17 @@
except ImportError:
img_import_success = False

from ezlocalai.VLM import VLM


class Pipes:
def __init__(self):
load_dotenv()
global img_import_success
self.current_llm = getenv("DEFAULT_MODEL")
self.current_vlm = getenv("VISION_MODEL")
self.llm = None
self.vlm = None
self.ctts = None
self.stt = None
self.embedder = None
@@ -35,6 +39,15 @@ def __init__(self):
logging.info(f"[LLM] {self.current_llm} model loaded successfully.")
if getenv("EMBEDDING_ENABLED").lower() == "true":
self.embedder = Embedding()
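# Load a standalone vision model when VISION_MODEL is set; on failure, continue without vision.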
if self.current_vlm != "":
logging.info(f"[VLM] {self.current_vlm} model loading. Please wait...")
try:
self.vlm = VLM(model=self.current_vlm)
except Exception as e:
logging.error(f"[VLM] Failed to load the model: {e}")
self.vlm = None
if self.vlm is not None:
logging.info(f"[ezlocalai] Vision is enabled with {self.current_vlm}.")
if getenv("TTS_ENABLED").lower() == "true":
logging.info(f"[CTTS] xttsv2_2.0.2 model loading. Please wait...")
self.ctts = CTTS()
@@ -44,6 +57,11 @@
logging.info(f"[STT] {self.current_stt} model loading. Please wait...")
self.stt = STT(model=self.current_stt)
logging.info(f"[STT] {self.current_stt} model loaded successfully.")
if is_vision_model(self.current_llm):
if self.vlm is None:
self.vlm = self.llm
if self.current_llm == "none" and self.vlm is not None:
self.llm = self.vlm
NGROK_TOKEN = getenv("NGROK_TOKEN")
if NGROK_TOKEN:
ngrok.set_auth_token(NGROK_TOKEN)
@@ -165,20 +183,39 @@ async def get_response(self, data, completion_type="chat"):
if completion_type == "chat"
else data["prompt"]
)
if images:
data["messages"][-1]["content"] = [
if self.vlm and images:
new_messages = [
{
"type": "text",
"text": user_message,
"role": "user",
"content": [
{
"type": "text",
"text": "Describe each stage of this image.",
},
],
}
]
data["messages"][-1]["content"].extend(images)
if completion_type == "chat":
new_messages[0]["content"].extend(images)
try:
response = self.llm.chat(**data)
image_description = self.vlm.chat(messages=new_messages)
print(
f"Image Description: {image_description['choices'][0]['message']['content']}"
)
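# Fold the VLM's description into the prompt so the text-only LLM can answer as if it saw the image.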
prompt = (
f"\n\nSee the uploaded image description for any questions about the uploaded image. Act as if you can see the image based on the description. Do not mention 'uploaded image description' in response. Uploaded Image Description: {image_description['choices'][0]['message']['content']}\n\n{data['messages'][-1]['content'][0]['text']}"
if completion_type == "chat"
else f"\n\nSee the uploaded image description for any questions about the uploaded image. Act as if you can see the image based on the description. Do not mention 'uploaded image description' in response. Uploaded Image Description: {image_description['choices'][0]['message']['content']}\n\n{data['prompt']}"
)
print(f"Full Prompt: {prompt}")
if completion_type == "chat":
data["messages"][-1]["content"] = prompt
else:
data["prompt"] = prompt
except:
data["messages"][-1]["content"] = user_message
response = self.llm.chat(**data)
logging.warning(f"[VLM] Unable to read image from URL.")
pass
if completion_type == "chat":
response = self.llm.chat(**data)
else:
response = self.llm.completion(**data)
generated_image = None
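
A client only has to attach an OpenAI-style image_url part to the last user message for get_response to route it through the VLM. A minimal request sketch, assuming ezlocalai exposes the usual OpenAI-compatible /v1/chat/completions route at EZLOCALAI_URL and accepts EZLOCALAI_API_KEY as a bearer token (both from the .env above); the image URL is a placeholder:

import requests

EZLOCALAI_URL = "http://localhost:8091"  # EZLOCALAI_URL from .env
EZLOCALAI_API_KEY = ""                   # EZLOCALAI_API_KEY from .env

payload = {
    "model": "bartowski/Qwen2.5-3B-Instruct-GGUF",
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is happening in this picture?"},
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/photo.jpg"},
                },
            ],
        }
    ],
}
response = requests.post(
    f"{EZLOCALAI_URL}/v1/chat/completions",
    headers={"Authorization": f"Bearer {EZLOCALAI_API_KEY}"},
    json=payload,
    timeout=300,
)
print(response.json()["choices"][0]["message"]["content"])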
docker-compose.yml (1 addition)
@@ -9,6 +9,7 @@ services:
- WHISPER_MODEL=${WHISPER_MODEL-base.en}
- IMG_ENABLED=${IMG_ENABLED-false}
- IMG_DEVICE=${IMG_DEVICE-cpu}
- VISION_MODEL=${VISION_MODEL}
- LLM_BATCH_SIZE=${LLM_BATCH_SIZE-1024}
- SD_MODEL=${SD_MODEL}
restart: unless-stopped
ezlocalai/VLM.py (new file, 181 additions)
@@ -0,0 +1,181 @@
try:
from deepseek_vl.models import VLChatProcessor
except:
VLChatProcessor = None
from transformers import AutoModelForCausalLM
from datetime import datetime
import requests
import torch
import PIL.Image
import uuid
import os
import base64
from ezlocalai.Helpers import get_tokens


class VLM:
def __init__(self, model="deepseek-ai/deepseek-vl-1.3b-chat"):
self.model = model.split("/")[-1]
self.params = {}
os.makedirs(os.path.join(os.getcwd(), "outputs"), exist_ok=True)
try:
self.vl_chat_processor = VLChatProcessor.from_pretrained(model)
self.tokenizer = self.vl_chat_processor.tokenizer
self.vl_gpt = AutoModelForCausalLM.from_pretrained(
model,
trust_remote_code=True,
cache_dir=os.path.join(os.getcwd(), "models"),
)
if torch.cuda.is_available():
self.vl_gpt = self.vl_gpt.to(torch.bfloat16).cuda().eval()
else:
self.vl_gpt = self.vl_gpt.to(torch.bfloat16).eval()
except Exception as e:
print(f"[VLM] Error: {e}")
self.vl_chat_processor = None
self.tokenizer = None
self.vl_gpt = None

def chat(self, messages, **kwargs):
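# Parse OpenAI-style messages: gather text parts into a single prompt and save any images (HTTP URLs or base64 data URIs) under ./outputs.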
pil_images = []
images = []
prompt = ""
for message in messages:
if isinstance(message["content"], str):
role = message["role"] if "role" in message else "User"
if role.lower() == "user":
prompt += f"{message['content']}\n\n"
if role.lower() == "system":
prompt = f"System: {message['content']}\n\nUser: {prompt}"
if isinstance(message["content"], list):
for msg in message["content"]:
if "text" in msg:
role = message["role"] if "role" in message else "User"
if role.lower() == "user":
prompt += f"{msg['text']}\n\n"
if "image_url" in msg:
url = str(
msg["image_url"]["url"]
if "url" in msg["image_url"]
else msg["image_url"]
)
image_path = f"./outputs/{uuid.uuid4().hex}.jpg"
if url.startswith("http"):
image = requests.get(url).content
else:
file_type = url.split(",")[0].split("/")[1].split(";")[0]
if file_type == "jpeg":
file_type = "jpg"
image_path = f"./outputs/{uuid.uuid4().hex}.{file_type}"
if "," in url:
image = base64.b64decode(url.split(",")[1])
else:
image = base64.b64decode(url)
with open(image_path, "wb") as f:
f.write(image)
images.append(image_path)
pil_img = PIL.Image.open(image_path)
pil_img = pil_img.convert("RGB")
pil_images.append(pil_img)
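# Prepend one <image_placeholder> token per image, as the DeepSeek-VL chat template expects.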
if len(images) > 0:
for image in images:
prompt = f"<image_placeholder> {prompt}"
conversation = [
{"role": "User", "content": prompt, "images": images},
{"role": "Assistant", "content": ""},
]
prepare_inputs = self.vl_chat_processor(
conversations=conversation, images=pil_images, force_batchify=True
).to(self.vl_gpt.device)
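# Build input embeddings from the processed conversation and generate the answer greedily (up to 1024 new tokens).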
inputs_embeds = self.vl_gpt.prepare_inputs_embeds(**prepare_inputs)
outputs = self.vl_gpt.language_model.generate(
inputs_embeds=inputs_embeds,
attention_mask=prepare_inputs.attention_mask,
pad_token_id=self.tokenizer.eos_token_id,
bos_token_id=self.tokenizer.bos_token_id,
eos_token_id=self.tokenizer.eos_token_id,
max_new_tokens=1024,
do_sample=False,
use_cache=True,
)
answer = self.tokenizer.decode(
outputs[0].cpu().tolist(), skip_special_tokens=True
)
completion_tokens = get_tokens(answer)
prompt_tokens = get_tokens(
" ".join([message["content"] for message in conversation])
)
total_tokens = completion_tokens + prompt_tokens
data = {
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {"content": answer, "role": "assistant"},
"logprobs": None,
}
],
"created": datetime.now().isoformat(),
"id": f"chatcmpl-{uuid.uuid4().hex}",
"model": self.model,
"object": "chat.completion",
"usage": {
"completion_tokens": completion_tokens,
"prompt_tokens": prompt_tokens,
"total_tokens": total_tokens,
},
}
return data

def completion(self, prompt, **kwargs):
messages = [
{"role": "User", "content": prompt},
]
completion = self.chat(
messages=messages,
max_tokens=kwargs["max_tokens"] if "max_tokens" in kwargs else 1024,
)
data = {
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": None,
"text": completion["choices"][0]["message"]["content"],
}
],
"created": datetime.now().isoformat(),
"id": f"cmpl-{uuid.uuid4().hex}",
"model": self.model,
"object": "text_completion",
"usage": {
"completion_tokens": completion["usage"]["completion_tokens"],
"prompt_tokens": completion["usage"]["prompt_tokens"],
"total_tokens": completion["usage"]["total_tokens"],
},
}
return data

def describe_image(self, image_url):
messages = [
{
"role": "User",
"content": [
{"type": "image_url", "image_url": {"url": image_url}},
{
"type": "text",
"text": "Describe each stage of this image.",
},
],
},
]
response = self.chat(
messages=messages,
)
return response["choices"][0]["message"]["content"]

def models(self):
return [
"deepseek-ai/deepseek-vl-1.3b-chat",
"deepseek-ai/deepseek-vl-7b-chat",
]
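
The class is usable on its own. A short usage sketch based on the methods above (the image URL is a placeholder; model weights are downloaded on first use):

from ezlocalai.VLM import VLM

vlm = VLM(model="deepseek-ai/deepseek-vl-1.3b-chat")

# One-call description of a remote image.
print(vlm.describe_image("https://example.com/diagram.png"))

# Or the chat() form with an explicit OpenAI-style message list.
result = vlm.chat(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What objects are visible?"},
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/diagram.png"},
                },
            ],
        }
    ]
)
print(result["choices"][0]["message"]["content"])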
