fix: added openai vision support

ErikBjare · Aug 13, 2024 · 6bbec93 · 6bbec93
1 parent f184607
commit 6bbec93
Show file tree

Hide file tree

Showing 2 changed files with 30 additions and 15 deletions.
diff --git a/gptme/llm.py b/gptme/llm.py
@@ -166,7 +166,8 @@ def _stream_openai(messages: list[Message], model: str) -> Generator[str, None,
         top_p=top_p,
         stream=True,
         # the llama-cpp-python server needs this explicitly set, otherwise unreliable results
-        max_tokens=1000 if not model.startswith("gpt-") else None,
+        # TODO: derive max_tokens from the selected model's output limit instead of hard-coding 1000/4096
+        max_tokens=1000 if not model.startswith("gpt-") else 4096,
     ):
         if not chunk.choices:  # type: ignore
             # Got a chunk with no choices, Azure always sends one of these at the start

diff --git a/gptme/message.py b/gptme/message.py
@@ -68,18 +68,24 @@ def __eq__(self, other):
     def to_dict(self, keys=None, anthropic=False) -> dict:
         """Return a dict representation of the message, serializable to JSON."""
         content: str | list[dict | str] = self.content
-
-        # if anthropic, make sure content is a list of dicts, to support multiple types of content
-        if anthropic:
-            content = [{"type": "text", "text": self.content}]
-            for f in self.files:
-                ext = f.suffix[1:]
-                if ext not in ["jpg", "jpeg", "png", "gif"]:
-                    logger.warning("Unsupported file type: %s", ext)
-                    continue
-                else:
-                    logger.warning("Found image file: %s", f)
-                media_type = f"image/{ext}"
+        allowed_file_exts = ["jpg", "jpeg", "png", "gif"]
+
+        content = [{"type": "text", "text": self.content}]
+        for f in self.files:
+            ext = f.suffix[1:]
+            if ext not in allowed_file_exts:
+                logger.warning("Unsupported file type: %s", ext)
+                continue
+            else:
+                logger.warning("Found image file: %s", f)
+            media_type = f"image/{ext}"
+            content.append(
+                {
+                    "type": "text",
+                    "text": f"![{f.name}]({f.name}):",
+                }
+            )
+            if anthropic:
                 content.append(
                     {
                         "type": "image",
@@ -90,8 +96,16 @@ def to_dict(self, keys=None, anthropic=False) -> dict:
                         },
                     }
                 )
-        else:
-            content = self.content
+            else:
+                # OpenAI format
+                content.append(
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:{media_type};base64,{base64.b64encode(f.read_bytes()).decode('utf-8')}"
+                        },
+                    }
+                )
 
         d = {
             "role": self.role,