diff --git a/docs/ai/speech.md b/docs/ai/audio/speech.md
similarity index 100%
rename from docs/ai/speech.md
rename to docs/ai/audio/speech.md
diff --git a/docs/ai/painting.md b/docs/ai/images/painting.md
similarity index 100%
rename from docs/ai/painting.md
rename to docs/ai/images/painting.md
diff --git a/docs/ai/casting.md b/docs/ai/text/casting.md
similarity index 100%
rename from docs/ai/casting.md
rename to docs/ai/text/casting.md
diff --git a/docs/ai/classification.md b/docs/ai/text/classification.md
similarity index 100%
rename from docs/ai/classification.md
rename to docs/ai/text/classification.md
diff --git a/docs/ai/extraction.md b/docs/ai/text/extraction.md
similarity index 100%
rename from docs/ai/extraction.md
rename to docs/ai/text/extraction.md
diff --git a/docs/ai/function.md b/docs/ai/text/function.md
similarity index 100%
rename from docs/ai/function.md
rename to docs/ai/text/function.md
diff --git a/docs/ai/generation.md b/docs/ai/text/generation.md
similarity index 100%
rename from docs/ai/generation.md
rename to docs/ai/text/generation.md
diff --git a/docs/ai/vision/captioning.md b/docs/ai/vision/captioning.md
new file mode 100644
index 000000000..67da84ce6
--- /dev/null
+++ b/docs/ai/vision/captioning.md
@@ -0,0 +1,41 @@
+# Captioning images
+
+Marvin can use OpenAI's vision API to process images as inputs.
+
+!!! tip "Beta"
+    Please note that vision support in Marvin is still in beta, as OpenAI has not finalized the vision API yet. While it works as expected, it is subject to change.
+
+<div class="admonition abstract">
+  <p class="admonition-title">What it does</p>
+  <p>
+    The <code>caption</code> function generates text from images.
+  </p>
+</div>
+
+
+!!! example
+
+    Generate a description of the following image, hypothetically available at `/path/to/marvin.jpg`:
+
+    ![](/assets/images/core/vision/marvin.webp)
+
+    ```python
+    import marvin
+    from pathlib import Path
+
+    marvin.caption(image=Path('/path/to/marvin.jpg'))
+    ```
+
+    !!! success "Result"
+        "This is a digital illustration featuring a stylized, cute character resembling a Funko Pop vinyl figure with large, shiny eyes and a square-shaped head, sitting on abstract wavy shapes that simulate a landscape. The whimsical figure is set against a dark background with sparkling, colorful bokeh effects, giving it a magical, dreamy atmosphere."
+
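A nuance the example above doesn't show: `caption` also accepts an `instructions` string, which is interpolated into the caption prompt template (see `vision_prompts.py` later in this diff). A minimal usage sketch; the instruction text and the path are illustrative, not part of the PR:

```python
import marvin
from pathlib import Path

# `instructions` is rendered into CAPTION_PROMPT's "## Instructions"
# section by the Jinja template added in this PR.
marvin.caption(
    image=Path('/path/to/marvin.jpg'),
    instructions="Focus on the character's expression, in one sentence.",
)
```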
+<div class="admonition info">
+  <p class="admonition-title">How it works</p>
+  <p>
+    Marvin passes your images to the OpenAI vision API as part of a larger prompt.
+  </p>
+</div>
+
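As context for what "part of a larger prompt" means here: the prompt template is rendered to text messages, and each image is appended as an `image_url` content part, with local files base64-encoded into a data URL (see `generate_llm_response` in `src/marvin/ai/beta/vision.py` below). A minimal sketch of that payload shape, assuming OpenAI's chat-completions message format; the helper name and dict literals are illustrative:

```python
import base64
from pathlib import Path
from typing import Union


def build_vision_payload(image: Union[str, Path], instructions: str) -> list[dict]:
    # Local paths are base64-encoded into a data URL; plain strings are
    # assumed to already be URLs, mirroring generate_llm_response below.
    if isinstance(image, Path):
        b64 = base64.b64encode(image.read_bytes()).decode("utf-8")
        url = f"data:image/jpeg;base64,{b64}"
    else:
        url = image
    return [
        {"role": "user", "content": instructions},
        {
            "role": "user",
            "content": [{"type": "image_url", "image_url": {"url": url}}],
        },
    ]


# e.g. build_vision_payload(Path("marvin.jpg"), "Describe this image.")
```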
diff --git a/docs/api_reference/ai/beta/vision.md b/docs/api_reference/ai/beta/vision.md
new file mode 100644
index 000000000..196493a7c
--- /dev/null
+++ b/docs/api_reference/ai/beta/vision.md
@@ -0,0 +1,6 @@
+# Vision tools
+
+!!! tip "Beta"
+    Please note that vision support in Marvin is still in beta, as OpenAI has not finalized the vision API yet. While it works as expected, it is subject to change.
+
+::: marvin.ai.beta.vision
\ No newline at end of file
diff --git a/docs/api_reference/requests.md b/docs/api_reference/requests.md
deleted file mode 100644
index 962a508d5..000000000
--- a/docs/api_reference/requests.md
+++ /dev/null
@@ -1 +0,0 @@
-::: marvin.requests
\ No newline at end of file
diff --git a/docs/api_reference/types.md b/docs/api_reference/types.md
new file mode 100644
index 000000000..c9091398e
--- /dev/null
+++ b/docs/api_reference/types.md
@@ -0,0 +1 @@
+::: marvin.types
\ No newline at end of file
diff --git a/docs/assets/images/core/vision/marvin.webp b/docs/assets/images/core/vision/marvin.webp
new file mode 100644
index 000000000..f40ea7d15
Binary files /dev/null and b/docs/assets/images/core/vision/marvin.webp differ
diff --git a/mkdocs.yml b/mkdocs.yml
index 3b648c797..3eafe1678 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -15,18 +15,20 @@ nav:
   # - Overview: welcome/overview.md
 
   - Text and data:
-      - AI functions: ai/function.md
-      - Structured data: ai/casting.md
-      - Entity extraction: ai/extraction.md
-      - Classification: ai/classification.md
-      - Generating synthetic data: ai/generation.md
+      - AI functions: ai/text/function.md
+      - Structured data: ai/text/casting.md
+      - Entity extraction: ai/text/extraction.md
+      - Classification: ai/text/classification.md
+      - Generating synthetic data: ai/text/generation.md
 
   - Images:
-      - Creating images: ai/painting.md
-      # - Captioning: ai/function.md
+      - Creating images: ai/images/painting.md
+
+  - Vision:
+      - Captioning: ai/vision/captioning.md
 
   - Audio:
-      - Text-to-speech: ai/speech.md
+      - Text-to-speech: ai/audio/speech.md
       # - Transcription: ai/function.md
 
   - Configuration:
@@ -42,8 +44,11 @@ nav:
       - marvin.ai.images: api_reference/ai/images.md
       - marvin.ai.audio: api_reference/ai/audio.md
 
+  - Beta AI modules:
+      - marvin.ai.beta.vision: api_reference/ai/beta/vision.md
+
   - Object schemas:
-      - marvin.requests: api_reference/requests.md
+      - marvin.types: api_reference/types.md
   - Settings:
       - marvin.settings: api_reference/settings.md
   - Utilities:
diff --git a/src/marvin/__init__.py b/src/marvin/__init__.py
index 2f7343c34..7ad9fc8fc 100644
--- a/src/marvin/__init__.py
+++ b/src/marvin/__init__.py
@@ -3,6 +3,7 @@
 from .ai.text import fn, cast, extract, classify, classifier, generate, model, Model
 from .ai.images import paint, image
 from .ai.audio import speak, speech
+from .ai.beta.vision import caption
 
 try:
     from ._version import version as __version__
@@ -26,6 +27,8 @@
     # --- audio ---
     "speak",
     "speech",
+    # --- vision (beta) ---
+    "caption",
 ]
 
diff --git a/src/marvin/ai/beta/__init__.py b/src/marvin/ai/beta/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/marvin/ai/beta/vision.py b/src/marvin/ai/beta/vision.py
new file mode 100644
index 000000000..082b3af36
--- /dev/null
+++ b/src/marvin/ai/beta/vision.py
@@ -0,0 +1,115 @@
+from pathlib import Path
+from typing import (
+    TypeVar,
+    Union,
+)
+
+from pydantic import BaseModel
+
+import marvin
+import marvin.utilities.tools
+from marvin.ai.prompts.vision_prompts import CAPTION_PROMPT
+from marvin.client.openai import MarvinClient
+from marvin.types import (
+    BaseMessage,
+    ChatResponse,
+    MessageImageURLContent,
+    VisionRequest,
+)
+from marvin.utilities.images import image_to_base64
+from marvin.utilities.jinja import Transcript
+from marvin.utilities.logging import get_logger
+
+T = TypeVar("T")
+M = TypeVar("M", bound=BaseModel)
+
+logger = get_logger(__name__)
+
+
+def generate_llm_response(
+    prompt_template: str,
+    images: list[Union[str, Path]],
+    prompt_kwargs: dict = None,
+    model_kwargs: dict = None,
+) -> ChatResponse:
+    """
+    Generates a language model response based on a provided prompt template.
+
+    This function uses a language model to generate a response based on a
+    provided prompt template. The function supports additional arguments for the
+    prompt and the language model.
+
+    Args:
+        prompt_template (str): The template for the prompt.
+        images (list[Union[str, Path]]): The images to be
+            used in the prompt. Can be either URLs or local paths.
+        prompt_kwargs (dict, optional): Additional keyword arguments
+            for the prompt. Defaults to None.
+        model_kwargs (dict, optional): Additional keyword arguments
+            for the language model. Defaults to None.
+
+    Returns:
+        ChatResponse: The generated response from the language model.
+    """
+    model_kwargs = model_kwargs or {}
+    prompt_kwargs = prompt_kwargs or {}
+    messages = Transcript(content=prompt_template).render_to_messages(**prompt_kwargs)
+
+    if images is not None:
+        for image in images:
+            # if images are local paths, convert them to base64. Otherwise
+            # assume they are URLs
+            if isinstance(image, Path):
+                b64_image = image_to_base64(image)
+                url = f"data:image/jpeg;base64,{b64_image}"
+            else:
+                url = image
+
+            messages.append(
+                BaseMessage(
+                    role="user",
+                    content=[MessageImageURLContent(image_url=dict(url=url))],
+                )
+            )
+
+    request = VisionRequest(messages=messages, **model_kwargs)
+    if marvin.settings.log_verbose:
+        logger.debug_kv("Request", request.model_dump_json(indent=2))
+    response = MarvinClient().generate_vision(
+        **request.model_dump(exclude_none=True, exclude_unset=True)
+    )
+    if marvin.settings.log_verbose:
+        logger.debug_kv("Response", response.model_dump_json(indent=2))
+    return ChatResponse(request=request, response=response)
+
+
+def caption(
+    image: Union[str, Path],
+    instructions: str = None,
+    model_kwargs: dict = None,
+) -> str:
+    """
+    Generates a caption for an image.
+
+    This function uses a language model to generate a caption for an image. The
+    function supports additional arguments for the language model.
+
+    Args:
+        image (Union[str, Path]): The URL or local path of the
+            image to be captioned.
+        instructions (str, optional): Specific instructions for
+            the caption. Defaults to None.
+        model_kwargs (dict, optional): Additional keyword
+            arguments for the language model. Defaults to None.
+
+    Returns:
+        str: The generated caption.
+    """
+    model_kwargs = model_kwargs or {}
+    response = generate_llm_response(
+        prompt_template=CAPTION_PROMPT,
+        images=[image],
+        prompt_kwargs=dict(instructions=instructions),
+        model_kwargs=model_kwargs,
+    )
+    return response.response.choices[0].message.content
diff --git a/src/marvin/ai/prompts/vision_prompts.py b/src/marvin/ai/prompts/vision_prompts.py
new file mode 100644
index 000000000..bb79ce2b5
--- /dev/null
+++ b/src/marvin/ai/prompts/vision_prompts.py
@@ -0,0 +1,15 @@
+import inspect
+
+CAPTION_PROMPT = inspect.cleandoc(
+    """
+    Generate a descriptive caption for the following image, and pay attention to any
+    additional instructions. Do not respond directly to the user ("you"), as
+    your response will become the input for other text processing functions.
+
+    {% if instructions -%}
+    ## Instructions
+
+    {{ instructions }}
+    {% endif %}
+    """
+)
diff --git a/src/marvin/ai/text.py b/src/marvin/ai/text.py
index 1f108f41e..f902f5604 100644
--- a/src/marvin/ai/text.py
+++ b/src/marvin/ai/text.py
@@ -10,6 +10,7 @@
     Callable,
     GenericAlias,
     Literal,
+    Optional,
     Type,
     TypeVar,
     Union,
@@ -45,8 +46,8 @@
 
 def generate_llm_response(
     prompt_template: str,
-    prompt_kwargs: dict = None,
-    model_kwargs: dict = None,
+    prompt_kwargs: Optional[dict] = None,
+    model_kwargs: Optional[dict] = None,
 ) -> ChatResponse:
     """
     Generates a language model response based on a provided prompt template.
@@ -65,6 +66,7 @@
     model_kwargs = model_kwargs or {}
     prompt_kwargs = prompt_kwargs or {}
     messages = Transcript(content=prompt_template).render_to_messages(**prompt_kwargs)
+
     request = ChatRequest(messages=messages, **model_kwargs)
     if marvin.settings.log_verbose:
         logger.debug_kv("Request", request.model_dump_json(indent=2))
diff --git a/src/marvin/client/openai.py b/src/marvin/client/openai.py
index 8421e093a..73becd06b 100644
--- a/src/marvin/client/openai.py
+++ b/src/marvin/client/openai.py
@@ -18,7 +18,7 @@
 
 import marvin
 from marvin import settings
-from marvin.types import ChatRequest, ImageRequest
+from marvin.types import ChatRequest, ImageRequest, VisionRequest
 
 if TYPE_CHECKING:
     from openai._base_client import HttpxBinaryResponseContent
@@ -70,7 +70,21 @@
         )
         # validate request
         request = ChatRequest(**kwargs)
-        response: "ChatCompletion" = create(**request.model_dump())
+        response: "ChatCompletion" = create(**request.model_dump(exclude_none=True))
+        return response
+
+    def generate_vision(
+        self,
+        *,
+        completion: Optional[Callable[..., "ChatCompletion"]] = None,
+        **kwargs: Any,
+    ) -> Union["ChatCompletion", T]:
+        create: Callable[..., "ChatCompletion"] = (
+            completion or self.client.chat.completions.create
+        )
+        # validate request
+        request = VisionRequest(**kwargs)
+        response: "ChatCompletion" = create(**request.model_dump(exclude_none=True))
         return response
 
     def generate_image(
@@ -79,7 +93,7 @@
     ) -> "ImagesResponse":
         # validate request
         request = ImageRequest(**marvin.settings.openai.images.model_dump() | kwargs)
-        return self.client.images.generate(**request.model_dump())
+        return self.client.images.generate(**request.model_dump(exclude_none=True))
 
     def generate_speech(
         self,
@@ -119,7 +133,23 @@
             create = self.client.chat.completions.create
         # validate request
         request = ChatRequest(**kwargs)
-        response: "ChatCompletion" = await create(request.model_dump())
+        response: "ChatCompletion" = await create(**request.model_dump(exclude_none=True))
+        return response
+
+    async def generate_vision(
+        self,
+        *,
+        completion: Optional[Callable[..., "ChatCompletion"]] = None,
+        **kwargs: Any,
+    ) -> Union["ChatCompletion", T]:
+        create: Callable[..., "ChatCompletion"] = (
+            completion or self.client.chat.completions.create
+        )
+        # validate request
+        request = VisionRequest(**kwargs)
+        response: "ChatCompletion" = await create(
+            **request.model_dump(exclude_none=True)
+        )
+        return response
 
     async def generate_image(
@@ -128,7 +158,9 @@
     ) -> "ImagesResponse":
         # validate request
         request = ImageRequest(**marvin.settings.openai.images.model_dump() | kwargs)
-        return await self.client.images.generate(**request.model_dump())
+        return await self.client.images.generate(
+            **request.model_dump(exclude_none=True)
+        )
 
     async def generate_audio(
         self,
diff --git a/src/marvin/settings.py b/src/marvin/settings.py
index 5d1de248e..85eab6e23 100644
--- a/src/marvin/settings.py
+++ b/src/marvin/settings.py
@@ -56,8 +56,24 @@
         return tiktoken.encoding_for_model(self.model).encode
 
 
+class ChatVisionSettings(MarvinSettings):
+    model_config = SettingsConfigDict(env_prefix="marvin_chat_vision_")
+    model: str = Field(
+        description="The default vision model to use.", default="gpt-4-vision-preview"
+    )
+    temperature: float = Field(description="The default temperature to use.", default=1)
+    max_tokens: int = 500
+
+    @property
+    def encoder(self):
+        import tiktoken
+
+        return tiktoken.encoding_for_model(self.model).encode
+
+
 class ChatSettings(MarvinSettings):
     completions: ChatCompletionSettings = Field(default_factory=ChatCompletionSettings)
+    vision: ChatVisionSettings = Field(default_factory=ChatVisionSettings)
 
 
 class ImageSettings(MarvinSettings):
diff --git a/src/marvin/types.py b/src/marvin/types.py
index 10bb35edc..f4389b467 100644
--- a/src/marvin/types.py
+++ b/src/marvin/types.py
@@ -63,8 +63,31 @@
     name: str
 
 
+class ImageUrl(BaseModel):
+    url: str = Field(
+        description="URL of the image to be sent or a base64 encoded image."
+    )
+    detail: str = "auto"
+
+
+class MessageImageURLContent(BaseModel):
+    """Schema for messages containing images"""
+
+    type: Literal["image_url"] = "image_url"
+    image_url: ImageUrl
+
+
+class MessageTextContent(BaseModel):
+    """Schema for messages containing text"""
+
+    type: Literal["text"] = "text"
+    text: str
+
+
 class BaseMessage(BaseModel):
-    content: str
+    """Base schema for messages"""
+
+    content: Union[str, list[Union[MessageImageURLContent, MessageTextContent]]]
     role: str
 
 
@@ -103,9 +126,33 @@
     user: Optional[str] = None
 
 
+class VisionRequest(BaseModel):
+    messages: list[BaseMessage] = Field(default_factory=list)
+    model: str = Field(default_factory=lambda: settings.openai.chat.vision.model)
+    logit_bias: Optional[LogitBias] = None
+    max_tokens: Optional[Annotated[int, Field(strict=True, ge=1)]] = Field(
+        default_factory=lambda: settings.openai.chat.vision.max_tokens
+    )
+    frequency_penalty: Optional[
+        Annotated[float, Field(strict=True, ge=-2.0, le=2.0)]
+    ] = 0
+    n: Optional[Annotated[int, Field(strict=True, ge=1)]] = 1
+    presence_penalty: Optional[
+        Annotated[float, Field(strict=True, ge=-2.0, le=2.0)]
+    ] = 0
+    seed: Optional[int] = None
+    stop: Optional[Union[str, list[str]]] = None
+    stream: Optional[bool] = False
+    temperature: Optional[Annotated[float, Field(strict=True, ge=0, le=2)]] = Field(
+        default_factory=lambda: settings.openai.chat.vision.temperature
+    )
+    top_p: Optional[Annotated[float, Field(strict=True, ge=0, le=1)]] = 1
+    user: Optional[str] = None
+
+
 class ChatResponse(BaseModel):
     model_config = dict(arbitrary_types_allowed=True)
-    request: ChatRequest
+    request: Union[ChatRequest, VisionRequest]
     response: ChatCompletion
     tool_outputs: list[Any] = []
 
@@ -113,6 +160,7 @@
 class ImageRequest(BaseModel):
     prompt: str
     model: Optional[str] = Field(default_factory=lambda: settings.openai.images.model)
+    n: Optional[int] = 1
 
     quality: Optional[Literal["standard", "hd"]] = Field(
         default_factory=lambda: settings.openai.images.quality
diff --git a/src/marvin/utilities/images.py b/src/marvin/utilities/images.py
new file mode 100644
index 000000000..94c38647d
--- /dev/null
+++ b/src/marvin/utilities/images.py
@@ -0,0 +1,18 @@
+import base64
+from pathlib import Path
+from typing import Union
+
+
+def image_to_base64(image_path: Union[str, Path]) -> str:
+    """
+    Converts a local image file to a base64 string.
+
+    Args:
+        image_path (Union[str, Path]): The path to the image file. This can be a
+            string or a Path object.
+
+    Returns:
+        str: The base64 representation of the image.
+    """
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode("utf-8")
diff --git a/tests/apis/test_extract.py b/tests/apis/test_extract.py
index cab6e6d6e..168fceb36 100644
--- a/tests/apis/test_extract.py
+++ b/tests/apis/test_extract.py
@@ -44,7 +44,11 @@
         )
         assert result == ["John", "Mary", "Bob"]
 
+    @pytest.mark.flaky(max_runs=3)
     def test_float_to_int(self):
+        # gpt 3.5 sometimes struggles with this test, marked as flaky
+        # pydantic no longer casts floats to ints, but gpt-3.5 assumes it's
+        # ok even when given instructions not to. GPT-4 seems to work ok.
         result = marvin.extract("the numbers are 1, 2, and 3.2", int)
         assert result == [1, 2, 3]
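A footnote on the flaky-test comment above: pydantic v2 rejects lossy float-to-int coercion, so `3.2` must be rounded by the model itself rather than by validation. A minimal sketch of the validation behavior the comment refers to, assuming pydantic v2; the `Numbers` model is illustrative:

```python
from pydantic import BaseModel, ValidationError


class Numbers(BaseModel):
    values: list[int]


# Lossy coercion is rejected: 3.2 raises instead of truncating to 3,
# which is why the LLM must return integers for the test to pass.
try:
    Numbers(values=[1, 2, 3.2])
except ValidationError as e:
    print(e)  # "Input should be a valid integer, got a number with a fractional part"

# Exact floats are still accepted and coerced.
assert Numbers(values=[1, 2, 3.0]).values == [1, 2, 3]
```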