Add vision #736

Merged · 8 commits · Jan 12, 2024
Changes from 3 commits
7 files renamed without changes.
41 changes: 41 additions & 0 deletions docs/ai/vision/captioning.md
@@ -0,0 +1,41 @@
# Captioning images

Marvin can use OpenAI's vision API to process images as inputs.

!!! tip "Beta"
Please note that vision support in Marvin is still in beta, as OpenAI has not finalized the vision API yet. While it works as expected, it is subject to change.

<div class="admonition abstract">
<p class="admonition-title">What it does</p>
<p>
The <code>caption</code> function generates text from images.
</p>
</div>



!!! example

Generate a description of the following image, hypothetically available at `/path/to/marvin.jpg`:

![](/assets/images/core/vision/marvin.webp)


```python
from pathlib import Path
import marvin

marvin.caption(image=Path('/path/to/marvin.jpg'))
```

!!! success "Result"
"This is a digital illustration featuring a stylized, cute character resembling a Funko Pop vinyl figure with large, shiny eyes and a square-shaped head, sitting on abstract wavy shapes that simulate a landscape. The whimsical figure is set against a dark background with sparkling, colorful bokeh effects, giving it a magical, dreamy atmosphere."


<div class="admonition info">
<p class="admonition-title">How it works</p>
<p>
Marvin passes your images to the OpenAI vision API as part of a larger prompt.
</p>
</div>
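
For instance, the optional `instructions` argument steers the caption toward a particular focus. A minimal sketch, assuming a hypothetical image URL (`caption` accepts either a URL string or a local `Path`):

```python
import marvin

# Hypothetical URL; remote images are passed through to the API as-is,
# while local Paths are base64-encoded first.
caption = marvin.caption(
    image="https://example.com/images/marvin.png",
    instructions="Focus on the character's expression and mood.",
)
print(caption)
```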

6 changes: 6 additions & 0 deletions docs/api_reference/ai/beta/vision.md
@@ -0,0 +1,6 @@
# Vision tools

!!! tip "Beta"
Please note that vision support in Marvin is still in beta, as OpenAI has not finalized the vision API yet. While it works as expected, it is subject to change.

::: marvin.ai.beta.vision
1 change: 0 additions & 1 deletion docs/api_reference/requests.md

This file was deleted.

1 change: 1 addition & 0 deletions docs/api_reference/types.md
@@ -0,0 +1 @@
::: marvin.types
Binary file added docs/assets/images/core/vision/marvin.webp
23 changes: 14 additions & 9 deletions mkdocs.yml
@@ -15,18 +15,20 @@ nav:
# - Overview: welcome/overview.md

- Text and data:
- AI functions: ai/function.md
- Structured data: ai/casting.md
- Entity extraction: ai/extraction.md
- Classification: ai/classification.md
- Generating synthetic data: ai/generation.md
- AI functions: ai/text/function.md
- Structured data: ai/text/casting.md
- Entity extraction: ai/text/extraction.md
- Classification: ai/text/classification.md
- Generating synthetic data: ai/text/generation.md

- Images:
- Creating images: ai/painting.md
# - Captioning: ai/function.md
- Creating images: ai/images/painting.md

- Vision:
- Captioning: ai/vision/captioning.md

- Audio:
- Text-to-speech: ai/speech.md
- Text-to-speech: ai/audio/speech.md
# - Transcription: ai/function.md

- Configuration:
@@ -42,8 +44,11 @@ nav:
- marvin.ai.images: api_reference/ai/images.md
- marvin.ai.audio: api_reference/ai/audio.md

- Beta AI modules:
- marvin.ai.beta.vision: api_reference/ai/beta/vision.md

- Object schemas:
- marvin.requests: api_reference/requests.md
- marvin.types: api_reference/types.md
- Settings:
- marvin.settings: api_reference/settings.md
- Utilities:
3 changes: 3 additions & 0 deletions src/marvin/__init__.py
@@ -3,6 +3,7 @@
from .ai.text import fn, cast, extract, classify, classifier, generate, model, Model
from .ai.images import paint, image
from .ai.audio import speak, speech
from .ai.beta.vision import caption

try:
from ._version import version as __version__
@@ -26,6 +27,8 @@
# --- audio ---
"speak",
"speech",
# --- vision (beta) ---
"caption",
]


Empty file added src/marvin/ai/beta/__init__.py
Empty file.
115 changes: 115 additions & 0 deletions src/marvin/ai/beta/vision.py
@@ -0,0 +1,115 @@
from pathlib import Path
from typing import (
TypeVar,
Union,
)

from pydantic import BaseModel

import marvin
import marvin.utilities.tools
from marvin.ai.prompts.vision_prompts import CAPTION_PROMPT
from marvin.client.openai import MarvinClient
from marvin.types import (
BaseMessage,
ChatResponse,
MessageImageURLContent,
VisionRequest,
)
from marvin.utilities.images import image_to_base64
from marvin.utilities.jinja import Transcript
from marvin.utilities.logging import get_logger

T = TypeVar("T")
M = TypeVar("M", bound=BaseModel)

logger = get_logger(__name__)


def generate_llm_response(
prompt_template: str,
images: list[Union[str, Path]],
prompt_kwargs: dict = None,
model_kwargs: dict = None,
) -> ChatResponse:
"""
Generates a language model response based on a provided prompt template.

This function uses a language model to generate a response based on a
provided prompt template. The function supports additional arguments for the
prompt and the language model.

Args:
prompt_template (str): The template for the prompt.
images (list[Union[str, Path]]): The images to be
used in the prompt. Can be either URLs or local paths.
prompt_kwargs (dict, optional): Additional keyword arguments
for the prompt. Defaults to None.
model_kwargs (dict, optional): Additional keyword arguments
for the language model. Defaults to None.

Returns:
ChatResponse: The generated response from the language model.
"""
model_kwargs = model_kwargs or {}
prompt_kwargs = prompt_kwargs or {}
messages = Transcript(content=prompt_template).render_to_messages(**prompt_kwargs)

if images is not None:
for image in images:
# if images are local paths, convert them to base64. Otherwise
# assume they are URLs
if isinstance(image, Path):
b64_image = image_to_base64(image)
url = f"data:image/jpeg;base64,{b64_image}"
else:
url = image

messages.append(
BaseMessage(
role="user",
content=[MessageImageURLContent(image_url=dict(url=url))],
)
)

request = VisionRequest(messages=messages, **model_kwargs)
if marvin.settings.log_verbose:
logger.debug_kv("Request", request.model_dump_json(indent=2))
response = MarvinClient().generate_vision(
**request.model_dump(exclude_none=True, exclude_unset=True)
)
if marvin.settings.log_verbose:
logger.debug_kv("Response", response.model_dump_json(indent=2))
return ChatResponse(request=request, response=response)


def caption(
image: Union[str, Path],
instructions: str = None,
model_kwargs: dict = None,
) -> str:
"""
Generates a caption for an image.

This function uses a language model to generate a caption for an image. The
function supports additional arguments for the language model.

Args:
image (Union[str, Path]): The URL or local path of the
image to be captioned.
instructions (str, optional): Specific instructions for
the caption. Defaults to None.
model_kwargs (dict, optional): Additional keyword
arguments for the language model. Defaults to None.

Returns:
str: The generated caption.
"""
model_kwargs = model_kwargs or {}
response = generate_llm_response(
prompt_template=CAPTION_PROMPT,
images=[image],
prompt_kwargs=dict(instructions=instructions),
model_kwargs=model_kwargs,
)
return response.response.choices[0].message.content
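
The local-path branch in `generate_llm_response` relies on `image_to_base64` to inline the file as a data URL. A self-contained sketch of that conversion, assuming JPEG content as the code above does:

```python
import base64
from pathlib import Path


def image_to_data_url(path: Path) -> str:
    """Read a local image and wrap it in a base64 data URL.

    A sketch of what the `image_to_base64` helper plus the f-string in
    `generate_llm_response` accomplish together; the `image/jpeg` MIME
    type is hardcoded there as well.
    """
    b64 = base64.b64encode(path.read_bytes()).decode()
    return f"data:image/jpeg;base64,{b64}"
```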
15 changes: 15 additions & 0 deletions src/marvin/ai/prompts/vision_prompts.py
@@ -0,0 +1,15 @@
import inspect

CAPTION_PROMPT = inspect.cleandoc(
"""
Generate a descriptive caption for the following image, and pay attention to any
additional instructions. Do not respond directly to the user ("you"), as
your response will become the input for other text processing functions.

{% if instructions -%}
## Instructions

{{ instructions }}
{% endif %}
"""
)
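
The `{% if instructions -%}` guard means the `## Instructions` section disappears entirely when no instructions are supplied. A standalone sketch of that rendering behavior using plain `jinja2` (Marvin's `Transcript` presumably layers message parsing on top):

```python
from jinja2 import Template

template = Template(
    "Generate a descriptive caption for the following image.\n"
    "{% if instructions -%}\n"
    "## Instructions\n\n"
    "{{ instructions }}\n"
    "{% endif %}"
)

print(template.render(instructions=None))               # section omitted
print(template.render(instructions="Mention colors."))  # section included
```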
5 changes: 2 additions & 3 deletions src/marvin/ai/text.py
@@ -44,9 +44,7 @@


def generate_llm_response(
prompt_template: str,
prompt_kwargs: dict = None,
model_kwargs: dict = None,
prompt_template: str, prompt_kwargs: dict = None, model_kwargs: dict = None
) -> ChatResponse:
"""
Generates a language model response based on a provided prompt template.
@@ -65,6 +63,7 @@ def generate_llm_response(
model_kwargs = model_kwargs or {}
prompt_kwargs = prompt_kwargs or {}
messages = Transcript(content=prompt_template).render_to_messages(**prompt_kwargs)

request = ChatRequest(messages=messages, **model_kwargs)
if marvin.settings.log_verbose:
logger.debug_kv("Request", request.model_dump_json(indent=2))
42 changes: 37 additions & 5 deletions src/marvin/client/openai.py
@@ -18,7 +18,7 @@

import marvin
from marvin import settings
from marvin.types import ChatRequest, ImageRequest
from marvin.types import ChatRequest, ImageRequest, VisionRequest

if TYPE_CHECKING:
from openai._base_client import HttpxBinaryResponseContent
@@ -70,7 +70,21 @@ def generate_chat(
)
# validate request
request = ChatRequest(**kwargs)
response: "ChatCompletion" = create(**request.model_dump())
response: "ChatCompletion" = create(**request.model_dump(exclude_none=True))
return response

def generate_vision(
self,
*,
completion: Optional[Callable[..., "ChatCompletion"]] = None,
**kwargs: Any,
) -> Union["ChatCompletion", T]:
create: Callable[..., "ChatCompletion"] = (
completion or self.client.chat.completions.create
)
# validate request
request = VisionRequest(**kwargs)
response: "ChatCompletion" = create(**request.model_dump(exclude_none=True))
return response

def generate_image(
@@ -79,7 +93,7 @@
) -> "ImagesResponse":
# validate request
request = ImageRequest(**marvin.settings.openai.images.model_dump() | kwargs)
return self.client.images.generate(**request.model_dump())
return self.client.images.generate(**request.model_dump(exclude_none=True))

def generate_speech(
self,
@@ -119,7 +133,23 @@ async def generate_chat(
create = self.client.chat.completions.create
# validate request
request = ChatRequest(**kwargs)
response: "ChatCompletion" = await create(request.model_dump())
response: "ChatCompletion" = await create(request.model_dump(exclude_none=True))
return response

async def generate_vision(
self,
*,
completion: Optional[Callable[..., "ChatCompletion"]] = None,
**kwargs: Any,
) -> Union["ChatCompletion", T]:
create: Callable[..., "ChatCompletion"] = (
completion or self.client.chat.completions.create
)
# validate request
request = VisionRequest(**kwargs)
response: "ChatCompletion" = await create(
**request.model_dump(exclude_none=True)
)
return response

async def generate_image(
@@ -128,7 +158,9 @@
) -> "ImagesResponse":
# validate request
request = ImageRequest(**marvin.settings.openai.images.model_dump() | kwargs)
return await self.client.images.generate(**request.model_dump())
return await self.client.images.generate(
**request.model_dump(exclude_none=True)
)

async def generate_audio(
self,
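
A recurring change in this file is dumping requests with `exclude_none=True`, so unset optional fields never reach the API payload. A self-contained sketch of the effect, using a hypothetical stand-in model:

```python
from typing import Optional

from pydantic import BaseModel


class DemoRequest(BaseModel):
    # Hypothetical stand-in for ChatRequest / VisionRequest.
    model: str = "gpt-4-vision-preview"
    max_tokens: Optional[int] = None
    temperature: Optional[float] = None


req = DemoRequest(temperature=0.2)
print(req.model_dump())                   # includes max_tokens=None
print(req.model_dump(exclude_none=True))  # drops unset None fields
```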
16 changes: 16 additions & 0 deletions src/marvin/settings.py
@@ -56,8 +56,24 @@ def encoder(self):
return tiktoken.encoding_for_model(self.model).encode


class ChatVisionSettings(MarvinSettings):
model_config = SettingsConfigDict(env_prefix="marvin_chat_vision_")
model: str = Field(
description="The default vision model to use.", default="gpt-4-vision-preview"
)
temperature: float = Field(description="The default temperature to use.", default=1)
max_tokens: int = 500

@property
def encoder(self):
import tiktoken

return tiktoken.encoding_for_model(self.model).encode


class ChatSettings(MarvinSettings):
completions: ChatCompletionSettings = Field(default_factory=ChatCompletionSettings)
vision: ChatVisionSettings = Field(default_factory=ChatVisionSettings)


class ImageSettings(MarvinSettings):
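
Because `ChatVisionSettings` sets `env_prefix="marvin_chat_vision_"`, each field can presumably be overridden through environment variables rather than code. A sketch, with the exact attribute path on the `marvin` module treated as an assumption:

```python
import os

# Hypothetical overrides; pydantic-settings matches these against the
# "marvin_chat_vision_" prefix (case-insensitively) when settings load.
os.environ["MARVIN_CHAT_VISION_MODEL"] = "gpt-4-vision-preview"
os.environ["MARVIN_CHAT_VISION_MAX_TOKENS"] = "1000"

import marvin  # import after setting the variables so they take effect

# Assumed attribute path, based on the ChatSettings.vision field above.
print(marvin.settings.openai.chat.vision.max_tokens)
```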