diff --git a/docs/ai/speech.md b/docs/ai/audio/speech.md
similarity index 100%
rename from docs/ai/speech.md
rename to docs/ai/audio/speech.md
diff --git a/docs/ai/painting.md b/docs/ai/images/painting.md
similarity index 100%
rename from docs/ai/painting.md
rename to docs/ai/images/painting.md
diff --git a/docs/ai/casting.md b/docs/ai/text/casting.md
similarity index 100%
rename from docs/ai/casting.md
rename to docs/ai/text/casting.md
diff --git a/docs/ai/classification.md b/docs/ai/text/classification.md
similarity index 100%
rename from docs/ai/classification.md
rename to docs/ai/text/classification.md
diff --git a/docs/ai/extraction.md b/docs/ai/text/extraction.md
similarity index 100%
rename from docs/ai/extraction.md
rename to docs/ai/text/extraction.md
diff --git a/docs/ai/function.md b/docs/ai/text/function.md
similarity index 100%
rename from docs/ai/function.md
rename to docs/ai/text/function.md
diff --git a/docs/ai/generation.md b/docs/ai/text/generation.md
similarity index 100%
rename from docs/ai/generation.md
rename to docs/ai/text/generation.md
diff --git a/docs/ai/vision/captioning.md b/docs/ai/vision/captioning.md
new file mode 100644
index 000000000..67da84ce6
--- /dev/null
+++ b/docs/ai/vision/captioning.md
@@ -0,0 +1,41 @@
+# Captioning images
+
+Marvin can use OpenAI's vision API to process images as inputs.
+
+!!! tip "Beta"
+ Please note that vision support in Marvin is still in beta, as OpenAI has not finalized the vision API yet. While it works as expected, it is subject to change.
+
+
+<div class="admonition abstract">
+  <p class="admonition-title">What it does</p>
+  <p>
+    The <code>caption</code> function generates text from images.
+  </p>
+</div>
+
+!!! example
+
+    Generate a description of the following image, hypothetically available at `/path/to/marvin.jpg`:
+
+    ![](/assets/images/core/vision/marvin.webp)
+
+    ```python
+    import marvin
+    from pathlib import Path
+
+    marvin.caption(image=Path('/path/to/marvin.jpg'))
+    ```
+
+    !!! success "Result"
+
+        "This is a digital illustration featuring a stylized, cute character resembling a Funko Pop vinyl figure with large, shiny eyes and a square-shaped head, sitting on abstract wavy shapes that simulate a landscape. The whimsical figure is set against a dark background with sparkling, colorful bokeh effects, giving it a magical, dreamy atmosphere."
+
+
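+You can also pass `instructions` to steer the caption. Below is a minimal sketch that reuses the hypothetical image path from the example above:
+
+```python
+import marvin
+from pathlib import Path
+
+# `instructions` is an optional argument to `caption`
+marvin.caption(
+    image=Path('/path/to/marvin.jpg'),
+    instructions='Describe the image in one short sentence.',
+)
+```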
+
+<div class="admonition info">
+  <p class="admonition-title">How it works</p>
+  <p>
+    Marvin passes your images to the OpenAI vision API as part of a larger prompt.
+  </p>
+</div>
+
diff --git a/docs/api_reference/ai/beta/vision.md b/docs/api_reference/ai/beta/vision.md
new file mode 100644
index 000000000..196493a7c
--- /dev/null
+++ b/docs/api_reference/ai/beta/vision.md
@@ -0,0 +1,6 @@
+# Vision tools
+
+!!! tip "Beta"
+ Please note that vision support in Marvin is still in beta, as OpenAI has not finalized the vision API yet. While it works as expected, it is subject to change.
+
+::: marvin.ai.beta.vision
\ No newline at end of file
diff --git a/docs/api_reference/requests.md b/docs/api_reference/requests.md
deleted file mode 100644
index 962a508d5..000000000
--- a/docs/api_reference/requests.md
+++ /dev/null
@@ -1 +0,0 @@
-::: marvin.requests
\ No newline at end of file
diff --git a/docs/api_reference/types.md b/docs/api_reference/types.md
new file mode 100644
index 000000000..c9091398e
--- /dev/null
+++ b/docs/api_reference/types.md
@@ -0,0 +1 @@
+::: marvin.types
\ No newline at end of file
diff --git a/docs/assets/images/core/vision/marvin.webp b/docs/assets/images/core/vision/marvin.webp
new file mode 100644
index 000000000..f40ea7d15
Binary files /dev/null and b/docs/assets/images/core/vision/marvin.webp differ
diff --git a/mkdocs.yml b/mkdocs.yml
index 3b648c797..3eafe1678 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -15,18 +15,20 @@ nav:
# - Overview: welcome/overview.md
- Text and data:
- - AI functions: ai/function.md
- - Structured data: ai/casting.md
- - Entity extraction: ai/extraction.md
- - Classification: ai/classification.md
- - Generating synthetic data: ai/generation.md
+ - AI functions: ai/text/function.md
+ - Structured data: ai/text/casting.md
+ - Entity extraction: ai/text/extraction.md
+ - Classification: ai/text/classification.md
+ - Generating synthetic data: ai/text/generation.md
- Images:
- - Creating images: ai/painting.md
- # - Captioning: ai/function.md
+ - Creating images: ai/images/painting.md
+
+ - Vision:
+ - Captioning: ai/vision/captioning.md
- Audio:
- - Text-to-speech: ai/speech.md
+ - Text-to-speech: ai/audio/speech.md
# - Transcription: ai/function.md
- Configuration:
@@ -42,8 +44,11 @@ nav:
- marvin.ai.images: api_reference/ai/images.md
- marvin.ai.audio: api_reference/ai/audio.md
+ - Beta AI modules:
+ - marvin.ai.beta.vision: api_reference/ai/beta/vision.md
+
- Object schemas:
- - marvin.requests: api_reference/requests.md
+ - marvin.types: api_reference/types.md
- Settings:
- marvin.settings: api_reference/settings.md
- Utilities:
diff --git a/src/marvin/__init__.py b/src/marvin/__init__.py
index 2f7343c34..7ad9fc8fc 100644
--- a/src/marvin/__init__.py
+++ b/src/marvin/__init__.py
@@ -3,6 +3,7 @@
from .ai.text import fn, cast, extract, classify, classifier, generate, model, Model
from .ai.images import paint, image
from .ai.audio import speak, speech
+from .ai.beta.vision import caption
try:
from ._version import version as __version__
@@ -26,6 +27,8 @@
# --- audio ---
"speak",
"speech",
+ # --- vision (beta) ---
+ "caption",
]
diff --git a/src/marvin/ai/beta/__init__.py b/src/marvin/ai/beta/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/marvin/ai/beta/vision.py b/src/marvin/ai/beta/vision.py
new file mode 100644
index 000000000..082b3af36
--- /dev/null
+++ b/src/marvin/ai/beta/vision.py
@@ -0,0 +1,115 @@
+from pathlib import Path
+from typing import (
+    Optional,
+    TypeVar,
+    Union,
+)
+
+from pydantic import BaseModel
+
+import marvin
+import marvin.utilities.tools
+from marvin.ai.prompts.vision_prompts import CAPTION_PROMPT
+from marvin.client.openai import MarvinClient
+from marvin.types import (
+ BaseMessage,
+ ChatResponse,
+ MessageImageURLContent,
+ VisionRequest,
+)
+from marvin.utilities.images import image_to_base64
+from marvin.utilities.jinja import Transcript
+from marvin.utilities.logging import get_logger
+
+T = TypeVar("T")
+M = TypeVar("M", bound=BaseModel)
+
+logger = get_logger(__name__)
+
+
+def generate_llm_response(
+ prompt_template: str,
+ images: list[Union[str, Path]],
+    prompt_kwargs: Optional[dict] = None,
+    model_kwargs: Optional[dict] = None,
+) -> ChatResponse:
+ """
+ Generates a language model response based on a provided prompt template.
+
+ This function uses a language model to generate a response based on a
+ provided prompt template. The function supports additional arguments for the
+ prompt and the language model.
+
+ Args:
+ prompt_template (str): The template for the prompt.
+ images (list[Union[str, Path]]): The images to be
+ used in the prompt. Can be either URLs or local paths.
+ prompt_kwargs (dict, optional): Additional keyword arguments
+ for the prompt. Defaults to None.
+ model_kwargs (dict, optional): Additional keyword arguments
+ for the language model. Defaults to None.
+
+ Returns:
+ ChatResponse: The generated response from the language model.
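+
+    Example:
+        A minimal sketch, assuming a hypothetical local image path:
+
+        >>> generate_llm_response(
+        ...     prompt_template=CAPTION_PROMPT,
+        ...     images=[Path("/path/to/image.jpg")],
+        ...     prompt_kwargs={"instructions": "Focus on the foreground."},
+        ... )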
+ """
+ model_kwargs = model_kwargs or {}
+ prompt_kwargs = prompt_kwargs or {}
+ messages = Transcript(content=prompt_template).render_to_messages(**prompt_kwargs)
+
+ if images is not None:
+ for image in images:
+ # if images are local paths, convert them to base64. Otherwise
+ # assume they are URLs
+ if isinstance(image, Path):
+ b64_image = image_to_base64(image)
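+                # the data URL assumes a JPEG payload; OpenAI's vision
+                # endpoint accepts base64-encoded images as data URLs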
+ url = f"data:image/jpeg;base64,{b64_image}"
+ else:
+ url = image
+
+ messages.append(
+ BaseMessage(
+ role="user",
+ content=[MessageImageURLContent(image_url=dict(url=url))],
+ )
+ )
+
+ request = VisionRequest(messages=messages, **model_kwargs)
+ if marvin.settings.log_verbose:
+ logger.debug_kv("Request", request.model_dump_json(indent=2))
+ response = MarvinClient().generate_vision(
+ **request.model_dump(exclude_none=True, exclude_unset=True)
+ )
+ if marvin.settings.log_verbose:
+ logger.debug_kv("Response", response.model_dump_json(indent=2))
+ return ChatResponse(request=request, response=response)
+
+
+def caption(
+ image: Union[str, Path],
+    instructions: Optional[str] = None,
+    model_kwargs: Optional[dict] = None,
+) -> str:
+ """
+ Generates a caption for an image.
+
+ This function uses a language model to generate a caption for an image. The
+ function supports additional arguments for the language model.
+
+ Args:
+ image (Union[str, Path]): The URL or local path of the
+ image to be captioned.
+ instructions (str, optional): Specific instructions for
+ the caption. Defaults to None.
+ model_kwargs (dict, optional): Additional keyword
+ arguments for the language model. Defaults to None.
+
+ Returns:
+ str: The generated caption.
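+
+    Example:
+        A minimal sketch, captioning a hypothetical local image:
+
+        >>> import marvin
+        >>> marvin.caption(image=Path("/path/to/marvin.jpg"))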
+ """
+ model_kwargs = model_kwargs or {}
+ response = generate_llm_response(
+ prompt_template=CAPTION_PROMPT,
+ images=[image],
+ prompt_kwargs=dict(instructions=instructions),
+ model_kwargs=model_kwargs,
+ )
+ return response.response.choices[0].message.content
diff --git a/src/marvin/ai/prompts/vision_prompts.py b/src/marvin/ai/prompts/vision_prompts.py
new file mode 100644
index 000000000..bb79ce2b5
--- /dev/null
+++ b/src/marvin/ai/prompts/vision_prompts.py
@@ -0,0 +1,15 @@
+import inspect
+
+CAPTION_PROMPT = inspect.cleandoc(
+ """
+ Generate a descriptive caption for the following image, and pay attention to any
+ additional instructions. Do not respond directly to the user ("you"), as
+ your response will become the input for other text processing functions.
+
+ {% if instructions -%}
+ ## Instructions
+
+ {{ instructions }}
+ {% endif %}
+ """
+)
diff --git a/src/marvin/ai/text.py b/src/marvin/ai/text.py
index 1f108f41e..f902f5604 100644
--- a/src/marvin/ai/text.py
+++ b/src/marvin/ai/text.py
@@ -10,6 +10,7 @@
Callable,
GenericAlias,
Literal,
+ Optional,
Type,
TypeVar,
Union,
@@ -45,8 +46,8 @@
def generate_llm_response(
prompt_template: str,
- prompt_kwargs: dict = None,
- model_kwargs: dict = None,
+ prompt_kwargs: Optional[dict] = None,
+ model_kwargs: Optional[dict] = None,
) -> ChatResponse:
"""
Generates a language model response based on a provided prompt template.
@@ -65,6 +66,7 @@ def generate_llm_response(
model_kwargs = model_kwargs or {}
prompt_kwargs = prompt_kwargs or {}
messages = Transcript(content=prompt_template).render_to_messages(**prompt_kwargs)
+
request = ChatRequest(messages=messages, **model_kwargs)
if marvin.settings.log_verbose:
logger.debug_kv("Request", request.model_dump_json(indent=2))
diff --git a/src/marvin/client/openai.py b/src/marvin/client/openai.py
index 8421e093a..73becd06b 100644
--- a/src/marvin/client/openai.py
+++ b/src/marvin/client/openai.py
@@ -18,7 +18,7 @@
import marvin
from marvin import settings
-from marvin.types import ChatRequest, ImageRequest
+from marvin.types import ChatRequest, ImageRequest, VisionRequest
if TYPE_CHECKING:
from openai._base_client import HttpxBinaryResponseContent
@@ -70,7 +70,21 @@ def generate_chat(
)
# validate request
request = ChatRequest(**kwargs)
- response: "ChatCompletion" = create(**request.model_dump())
+ response: "ChatCompletion" = create(**request.model_dump(exclude_none=True))
+ return response
+
+ def generate_vision(
+ self,
+ *,
+ completion: Optional[Callable[..., "ChatCompletion"]] = None,
+ **kwargs: Any,
+ ) -> Union["ChatCompletion", T]:
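+        """Generate a chat completion for a vision request.
+
+        Keyword arguments are validated as a `VisionRequest` before being
+        passed to the OpenAI chat completions endpoint.
+        """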
+ create: Callable[..., "ChatCompletion"] = (
+ completion or self.client.chat.completions.create
+ )
+ # validate request
+ request = VisionRequest(**kwargs)
+ response: "ChatCompletion" = create(**request.model_dump(exclude_none=True))
return response
def generate_image(
@@ -79,7 +93,7 @@ def generate_image(
) -> "ImagesResponse":
# validate request
request = ImageRequest(**marvin.settings.openai.images.model_dump() | kwargs)
- return self.client.images.generate(**request.model_dump())
+ return self.client.images.generate(**request.model_dump(exclude_none=True))
def generate_speech(
self,
@@ -119,7 +133,23 @@ async def generate_chat(
create = self.client.chat.completions.create
# validate request
request = ChatRequest(**kwargs)
- response: "ChatCompletion" = await create(request.model_dump())
+        response: "ChatCompletion" = await create(
+            **request.model_dump(exclude_none=True)
+        )
+ return response
+
+ async def generate_vision(
+ self,
+ *,
+ completion: Optional[Callable[..., "ChatCompletion"]] = None,
+ **kwargs: Any,
+ ) -> Union["ChatCompletion", T]:
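+        """Async version of `generate_vision`.
+
+        Keyword arguments are validated as a `VisionRequest` before being
+        passed to the OpenAI chat completions endpoint.
+        """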
+ create: Callable[..., "ChatCompletion"] = (
+ completion or self.client.chat.completions.create
+ )
+ # validate request
+ request = VisionRequest(**kwargs)
+ response: "ChatCompletion" = await create(
+ **request.model_dump(exclude_none=True)
+ )
return response
async def generate_image(
@@ -128,7 +158,9 @@ async def generate_image(
) -> "ImagesResponse":
# validate request
request = ImageRequest(**marvin.settings.openai.images.model_dump() | kwargs)
- return await self.client.images.generate(**request.model_dump())
+ return await self.client.images.generate(
+ **request.model_dump(exclude_none=True)
+ )
async def generate_audio(
self,
diff --git a/src/marvin/settings.py b/src/marvin/settings.py
index 5d1de248e..85eab6e23 100644
--- a/src/marvin/settings.py
+++ b/src/marvin/settings.py
@@ -56,8 +56,24 @@ def encoder(self):
return tiktoken.encoding_for_model(self.model).encode
+class ChatVisionSettings(MarvinSettings):
+ model_config = SettingsConfigDict(env_prefix="marvin_chat_vision_")
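+    # fields can be overridden via environment variables, e.g.
+    # MARVIN_CHAT_VISION_MODEL=gpt-4-vision-preview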
+ model: str = Field(
+ description="The default vision model to use.", default="gpt-4-vision-preview"
+ )
+ temperature: float = Field(description="The default temperature to use.", default=1)
+ max_tokens: int = 500
+
+ @property
+ def encoder(self):
+ import tiktoken
+
+ return tiktoken.encoding_for_model(self.model).encode
+
+
class ChatSettings(MarvinSettings):
completions: ChatCompletionSettings = Field(default_factory=ChatCompletionSettings)
+ vision: ChatVisionSettings = Field(default_factory=ChatVisionSettings)
class ImageSettings(MarvinSettings):
diff --git a/src/marvin/types.py b/src/marvin/types.py
index 10bb35edc..f4389b467 100644
--- a/src/marvin/types.py
+++ b/src/marvin/types.py
@@ -63,8 +63,31 @@ class FunctionCall(BaseModel):
name: str
+class ImageUrl(BaseModel):
+ url: str = Field(
+ description="URL of the image to be sent or a base64 encoded image."
+ )
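+    # per the OpenAI vision API, detail may be "low", "high", or "auto"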
+ detail: str = "auto"
+
+
+class MessageImageURLContent(BaseModel):
+ """Schema for messages containing images"""
+
+ type: Literal["image_url"] = "image_url"
+ image_url: ImageUrl
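+    # serializes to the OpenAI vision content-part shape, e.g.
+    # {"type": "image_url", "image_url": {"url": "...", "detail": "auto"}}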
+
+
+class MessageTextContent(BaseModel):
+ """Schema for messages containing text"""
+
+ type: Literal["text"] = "text"
+ text: str
+
+
class BaseMessage(BaseModel):
- content: str
+ """Base schema for messages"""
+
+ content: Union[str, list[Union[MessageImageURLContent, MessageTextContent]]]
role: str
@@ -103,9 +126,33 @@ class ChatRequest(Prompt[T]):
user: Optional[str] = None
+class VisionRequest(BaseModel):
+ messages: list[BaseMessage] = Field(default_factory=list)
+ model: str = Field(default_factory=lambda: settings.openai.chat.vision.model)
+ logit_bias: Optional[LogitBias] = None
+ max_tokens: Optional[Annotated[int, Field(strict=True, ge=1)]] = Field(
+ default_factory=lambda: settings.openai.chat.vision.max_tokens
+ )
+ frequency_penalty: Optional[
+ Annotated[float, Field(strict=True, ge=-2.0, le=2.0)]
+ ] = 0
+ n: Optional[Annotated[int, Field(strict=True, ge=1)]] = 1
+ presence_penalty: Optional[
+ Annotated[float, Field(strict=True, ge=-2.0, le=2.0)]
+ ] = 0
+ seed: Optional[int] = None
+ stop: Optional[Union[str, list[str]]] = None
+ stream: Optional[bool] = False
+ temperature: Optional[Annotated[float, Field(strict=True, ge=0, le=2)]] = Field(
+ default_factory=lambda: settings.openai.chat.vision.temperature
+ )
+ top_p: Optional[Annotated[float, Field(strict=True, ge=0, le=1)]] = 1
+ user: Optional[str] = None
+
+
class ChatResponse(BaseModel):
model_config = dict(arbitrary_types_allowed=True)
- request: ChatRequest
+ request: Union[ChatRequest, VisionRequest]
response: ChatCompletion
tool_outputs: list[Any] = []
@@ -113,6 +160,7 @@ class ChatResponse(BaseModel):
class ImageRequest(BaseModel):
prompt: str
model: Optional[str] = Field(default_factory=lambda: settings.openai.images.model)
+
n: Optional[int] = 1
quality: Optional[Literal["standard", "hd"]] = Field(
default_factory=lambda: settings.openai.images.quality
diff --git a/src/marvin/utilities/images.py b/src/marvin/utilities/images.py
new file mode 100644
index 000000000..94c38647d
--- /dev/null
+++ b/src/marvin/utilities/images.py
@@ -0,0 +1,18 @@
+import base64
+from pathlib import Path
+from typing import Union
+
+
+def image_to_base64(image_path: Union[str, Path]) -> str:
+ """
+ Converts a local image file to a base64 string.
+
+ Args:
+ image_path (Union[str, Path]): The path to the image file. This can be a
+ string or a Path object.
+
+ Returns:
+ str: The base64 representation of the image.
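+
+    Example:
+        Build a data URL for the OpenAI vision API (hypothetical path):
+
+        >>> b64 = image_to_base64("/path/to/image.jpg")
+        >>> url = f"data:image/jpeg;base64,{b64}"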
+ """
+ with open(image_path, "rb") as image_file:
+ return base64.b64encode(image_file.read()).decode("utf-8")
diff --git a/tests/apis/test_extract.py b/tests/apis/test_extract.py
index cab6e6d6e..168fceb36 100644
--- a/tests/apis/test_extract.py
+++ b/tests/apis/test_extract.py
@@ -44,7 +44,11 @@ def test_extract_names(self):
)
assert result == ["John", "Mary", "Bob"]
+ @pytest.mark.flaky(max_runs=3)
def test_float_to_int(self):
+        # GPT-3.5 sometimes struggles with this test, so it is marked as flaky:
+        # Pydantic no longer casts floats to ints, and GPT-3.5 assumes the cast
+        # is acceptable even when instructed otherwise. GPT-4 seems to work fine.
result = marvin.extract("the numbers are 1, 2, and 3.2", int)
assert result == [1, 2, 3]