diff --git a/docs/ai/speech.md b/docs/ai/audio/speech.md
similarity index 100%
rename from docs/ai/speech.md
rename to docs/ai/audio/speech.md
diff --git a/docs/ai/painting.md b/docs/ai/images/painting.md
similarity index 100%
rename from docs/ai/painting.md
rename to docs/ai/images/painting.md
diff --git a/docs/ai/casting.md b/docs/ai/text/casting.md
similarity index 100%
rename from docs/ai/casting.md
rename to docs/ai/text/casting.md
diff --git a/docs/ai/classification.md b/docs/ai/text/classification.md
similarity index 100%
rename from docs/ai/classification.md
rename to docs/ai/text/classification.md
diff --git a/docs/ai/extraction.md b/docs/ai/text/extraction.md
similarity index 100%
rename from docs/ai/extraction.md
rename to docs/ai/text/extraction.md
diff --git a/docs/ai/function.md b/docs/ai/text/function.md
similarity index 100%
rename from docs/ai/function.md
rename to docs/ai/text/function.md
diff --git a/docs/ai/generation.md b/docs/ai/text/generation.md
similarity index 100%
rename from docs/ai/generation.md
rename to docs/ai/text/generation.md
diff --git a/docs/ai/vision/captioning.md b/docs/ai/vision/captioning.md
new file mode 100644
index 000000000..67da84ce6
--- /dev/null
+++ b/docs/ai/vision/captioning.md
@@ -0,0 +1,41 @@
+# Captioning images
+
+Marvin can use OpenAI's vision API to process images as inputs.
+
+!!! tip "Beta"
+    Please note that vision support in Marvin is still in beta, as OpenAI has not finalized the vision API yet. While it works as expected, it is subject to change.
+
+<div class="admonition abstract">
+  <p class="admonition-title">What it does</p>
+  <p>
+    The <code>caption</code> function generates text from images.
+  </p>
+</div>
+
+
+!!! example
+
+    Generate a description of the following image, hypothetically available at `/path/to/marvin.jpg`:
+
+    ![](/assets/images/core/vision/marvin.webp)
+
+    ```python
+    import marvin
+    from pathlib import Path
+
+    marvin.caption(image=Path('/path/to/marvin.jpg'))
+    ```
+
+    !!! success "Result"
+        "This is a digital illustration featuring a stylized, cute character resembling a Funko Pop vinyl figure with large, shiny eyes and a square-shaped head, sitting on abstract wavy shapes that simulate a landscape. The whimsical figure is set against a dark background with sparkling, colorful bokeh effects, giving it a magical, dreamy atmosphere."
+
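A nuance the example above doesn't show: `caption` also accepts an `instructions` string, which is interpolated into the caption prompt template (see `vision_prompts.py` later in this diff). A minimal usage sketch; the instruction text and the path are illustrative, not part of the PR:

```python
import marvin
from pathlib import Path

# `instructions` is rendered into CAPTION_PROMPT's "## Instructions"
# section by the Jinja template added in this PR.
marvin.caption(
    image=Path('/path/to/marvin.jpg'),
    instructions="Focus on the character's expression, in one sentence.",
)
```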
+<div class="admonition info">
+  <p class="admonition-title">How it works</p>
+  <p>
+    Marvin passes your images to the OpenAI vision API as part of a larger prompt.
+  </p>
+</div>
+
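As context for what "part of a larger prompt" means here: the prompt template is rendered to text messages, and each image is appended as an `image_url` content part, with local files base64-encoded into a data URL (see `generate_llm_response` in `src/marvin/ai/beta/vision.py` below). A minimal sketch of that payload shape, assuming OpenAI's chat-completions message format; the helper name and dict literals are illustrative:

```python
import base64
from pathlib import Path
from typing import Union


def build_vision_payload(image: Union[str, Path], instructions: str) -> list[dict]:
    # Local paths are base64-encoded into a data URL; plain strings are
    # assumed to already be URLs, mirroring generate_llm_response below.
    if isinstance(image, Path):
        b64 = base64.b64encode(image.read_bytes()).decode("utf-8")
        url = f"data:image/jpeg;base64,{b64}"
    else:
        url = image
    return [
        {"role": "user", "content": instructions},
        {
            "role": "user",
            "content": [{"type": "image_url", "image_url": {"url": url}}],
        },
    ]


# e.g. build_vision_payload(Path("marvin.jpg"), "Describe this image.")
```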
diff --git a/docs/api_reference/ai/beta/vision.md b/docs/api_reference/ai/beta/vision.md
new file mode 100644
index 000000000..196493a7c
--- /dev/null
+++ b/docs/api_reference/ai/beta/vision.md
@@ -0,0 +1,6 @@
+# Vision tools
+
+!!! tip "Beta"
+    Please note that vision support in Marvin is still in beta, as OpenAI has not finalized the vision API yet. While it works as expected, it is subject to change.
+
+::: marvin.ai.beta.vision
\ No newline at end of file
diff --git a/docs/api_reference/requests.md b/docs/api_reference/requests.md
deleted file mode 100644
index 962a508d5..000000000
--- a/docs/api_reference/requests.md
+++ /dev/null
@@ -1 +0,0 @@
-::: marvin.requests
\ No newline at end of file
diff --git a/docs/api_reference/types.md b/docs/api_reference/types.md
new file mode 100644
index 000000000..c9091398e
--- /dev/null
+++ b/docs/api_reference/types.md
@@ -0,0 +1 @@
+::: marvin.types
\ No newline at end of file
diff --git a/docs/assets/images/core/vision/marvin.webp b/docs/assets/images/core/vision/marvin.webp
new file mode 100644
index 000000000..f40ea7d15
Binary files /dev/null and b/docs/assets/images/core/vision/marvin.webp differ
diff --git a/mkdocs.yml b/mkdocs.yml
index 3b648c797..3eafe1678 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -15,18 +15,20 @@ nav:
   # - Overview: welcome/overview.md
 
   - Text and data:
-      - AI functions: ai/function.md
-      - Structured data: ai/casting.md
-      - Entity extraction: ai/extraction.md
-      - Classification: ai/classification.md
-      - Generating synthetic data: ai/generation.md
+      - AI functions: ai/text/function.md
+      - Structured data: ai/text/casting.md
+      - Entity extraction: ai/text/extraction.md
+      - Classification: ai/text/classification.md
+      - Generating synthetic data: ai/text/generation.md
 
   - Images:
-      - Creating images: ai/painting.md
-      # - Captioning: ai/function.md
+      - Creating images: ai/images/painting.md
+
+  - Vision:
+      - Captioning: ai/vision/captioning.md
 
   - Audio:
-      - Text-to-speech: ai/speech.md
+      - Text-to-speech: ai/audio/speech.md
       # - Transcription: ai/function.md
 
   - Configuration:
@@ -42,8 +44,11 @@ nav:
       - marvin.ai.images: api_reference/ai/images.md
       - marvin.ai.audio: api_reference/ai/audio.md
 
+  - Beta AI modules:
+      - marvin.ai.beta.vision: api_reference/ai/beta/vision.md
+
   - Object schemas:
-      - marvin.requests: api_reference/requests.md
+      - marvin.types: api_reference/types.md
   - Settings:
       - marvin.settings: api_reference/settings.md
   - Utilities:
diff --git a/src/marvin/__init__.py b/src/marvin/__init__.py
index 2f7343c34..7ad9fc8fc 100644
--- a/src/marvin/__init__.py
+++ b/src/marvin/__init__.py
@@ -3,6 +3,7 @@
 from .ai.text import fn, cast, extract, classify, classifier, generate, model, Model
 from .ai.images import paint, image
 from .ai.audio import speak, speech
+from .ai.beta.vision import caption
 
 try:
     from ._version import version as __version__
@@ -26,6 +27,8 @@
     # --- audio ---
     "speak",
     "speech",
+    # --- vision (beta) ---
+    "caption",
 ]
 
diff --git a/src/marvin/ai/beta/__init__.py b/src/marvin/ai/beta/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/marvin/ai/beta/vision.py b/src/marvin/ai/beta/vision.py
new file mode 100644
index 000000000..082b3af36
--- /dev/null
+++ b/src/marvin/ai/beta/vision.py
@@ -0,0 +1,115 @@
+from pathlib import Path
+from typing import (
+    TypeVar,
+    Union,
+)
+
+from pydantic import BaseModel
+
+import marvin
+import marvin.utilities.tools
+from marvin.ai.prompts.vision_prompts import CAPTION_PROMPT
+from marvin.client.openai import MarvinClient
+from marvin.types import (
+    BaseMessage,
+    ChatResponse,
+    MessageImageURLContent,
+    VisionRequest,
+)
+from marvin.utilities.images import image_to_base64
+from marvin.utilities.jinja import Transcript
+from marvin.utilities.logging import get_logger
+
+T = TypeVar("T")
+M = TypeVar("M", bound=BaseModel)
+
+logger = get_logger(__name__)
+
+
+def generate_llm_response(
+    prompt_template: str,
+    images: list[Union[str, Path]],
+    prompt_kwargs: dict = None,
+    model_kwargs: dict = None,
+) -> ChatResponse:
+    """
+    Generates a language model response based on a provided prompt template.
+
+    This function uses a language model to generate a response based on a
+    provided prompt template. The function supports additional arguments for the
+    prompt and the language model.
+
+    Args:
+        prompt_template (str): The template for the prompt.
+        images (list[Union[str, Path]]): The images to be
+            used in the prompt. Can be either URLs or local paths.
+        prompt_kwargs (dict, optional): Additional keyword arguments
+            for the prompt. Defaults to None.
+        model_kwargs (dict, optional): Additional keyword arguments
+            for the language model. Defaults to None.
+
+    Returns:
+        ChatResponse: The generated response from the language model.
+    """
+    model_kwargs = model_kwargs or {}
+    prompt_kwargs = prompt_kwargs or {}
+    messages = Transcript(content=prompt_template).render_to_messages(**prompt_kwargs)
+
+    if images is not None:
+        for image in images:
+            # if images are local paths, convert them to base64. Otherwise
+            # assume they are URLs
+            if isinstance(image, Path):
+                b64_image = image_to_base64(image)
+                url = f"data:image/jpeg;base64,{b64_image}"
+            else:
+                url = image
+
+            messages.append(
+                BaseMessage(
+                    role="user",
+                    content=[MessageImageURLContent(image_url=dict(url=url))],
+                )
+            )
+
+    request = VisionRequest(messages=messages, **model_kwargs)
+    if marvin.settings.log_verbose:
+        logger.debug_kv("Request", request.model_dump_json(indent=2))
+    response = MarvinClient().generate_vision(
+        **request.model_dump(exclude_none=True, exclude_unset=True)
+    )
+    if marvin.settings.log_verbose:
+        logger.debug_kv("Response", response.model_dump_json(indent=2))
+    return ChatResponse(request=request, response=response)
+
+
+def caption(
+    image: Union[str, Path],
+    instructions: str = None,
+    model_kwargs: dict = None,
+) -> str:
+    """
+    Generates a caption for an image.
+
+    This function uses a language model to generate a caption for an image. The
+    function supports additional arguments for the language model.
+
+    Args:
+        image (Union[str, Path]): The URL or local path of the
+            image to be captioned.
+        instructions (str, optional): Specific instructions for
+            the caption. Defaults to None.
+        model_kwargs (dict, optional): Additional keyword
+            arguments for the language model. Defaults to None.
+
+    Returns:
+        str: The generated caption.
+    """
+    model_kwargs = model_kwargs or {}
+    response = generate_llm_response(
+        prompt_template=CAPTION_PROMPT,
+        images=[image],
+        prompt_kwargs=dict(instructions=instructions),
+        model_kwargs=model_kwargs,
+    )
+    return response.response.choices[0].message.content
diff --git a/src/marvin/ai/prompts/vision_prompts.py b/src/marvin/ai/prompts/vision_prompts.py
new file mode 100644
index 000000000..bb79ce2b5
--- /dev/null
+++ b/src/marvin/ai/prompts/vision_prompts.py
@@ -0,0 +1,15 @@
+import inspect
+
+CAPTION_PROMPT = inspect.cleandoc(
+    """
+    Generate a descriptive caption for the following image, and pay attention to any
+    additional instructions. Do not respond directly to the user ("you"), as
+    your response will become the input for other text processing functions.
+
+    {% if instructions -%}
+    ## Instructions
+
+    {{ instructions }}
+    {% endif %}
+    """
+)
diff --git a/src/marvin/ai/text.py b/src/marvin/ai/text.py
index 1f108f41e..f902f5604 100644
--- a/src/marvin/ai/text.py
+++ b/src/marvin/ai/text.py
@@ -10,6 +10,7 @@
     Callable,
     GenericAlias,
     Literal,
+    Optional,
     Type,
     TypeVar,
     Union,
@@ -45,8 +46,8 @@
 
 def generate_llm_response(
     prompt_template: str,
-    prompt_kwargs: dict = None,
-    model_kwargs: dict = None,
+    prompt_kwargs: Optional[dict] = None,
+    model_kwargs: Optional[dict] = None,
 ) -> ChatResponse:
     """
     Generates a language model response based on a provided prompt template.
@@ -65,6 +66,7 @@
     model_kwargs = model_kwargs or {}
     prompt_kwargs = prompt_kwargs or {}
     messages = Transcript(content=prompt_template).render_to_messages(**prompt_kwargs)
+
     request = ChatRequest(messages=messages, **model_kwargs)
     if marvin.settings.log_verbose:
         logger.debug_kv("Request", request.model_dump_json(indent=2))
diff --git a/src/marvin/client/openai.py b/src/marvin/client/openai.py
index 8421e093a..73becd06b 100644
--- a/src/marvin/client/openai.py
+++ b/src/marvin/client/openai.py
@@ -18,7 +18,7 @@
 
 import marvin
 from marvin import settings
-from marvin.types import ChatRequest, ImageRequest
+from marvin.types import ChatRequest, ImageRequest, VisionRequest
 
 if TYPE_CHECKING:
     from openai._base_client import HttpxBinaryResponseContent
@@ -70,7 +70,21 @@
         )
         # validate request
         request = ChatRequest(**kwargs)
-        response: "ChatCompletion" = create(**request.model_dump())
+        response: "ChatCompletion" = create(**request.model_dump(exclude_none=True))
+        return response
+
+    def generate_vision(
+        self,
+        *,
+        completion: Optional[Callable[..., "ChatCompletion"]] = None,
+        **kwargs: Any,
+    ) -> Union["ChatCompletion", T]:
+        create: Callable[..., "ChatCompletion"] = (
+            completion or self.client.chat.completions.create
+        )
+        # validate request
+        request = VisionRequest(**kwargs)
+        response: "ChatCompletion" = create(**request.model_dump(exclude_none=True))
         return response
 
     def generate_image(
@@ -79,7 +93,7 @@
     ) -> "ImagesResponse":
         # validate request
         request = ImageRequest(**marvin.settings.openai.images.model_dump() | kwargs)
-        return self.client.images.generate(**request.model_dump())
+        return self.client.images.generate(**request.model_dump(exclude_none=True))
 
     def generate_speech(
         self,
@@ -119,7 +133,23 @@
             create = self.client.chat.completions.create
         # validate request
         request = ChatRequest(**kwargs)
-        response: "ChatCompletion" = await create(request.model_dump())
+        response: "ChatCompletion" = await create(**request.model_dump(exclude_none=True))
+        return response
+
+    async def generate_vision(
+        self,
+        *,
+        completion: Optional[Callable[..., "ChatCompletion"]] = None,
+        **kwargs: Any,
+    ) -> Union["ChatCompletion", T]:
+        create: Callable[..., "ChatCompletion"] = (
+            completion or self.client.chat.completions.create
+        )
+        # validate request
+        request = VisionRequest(**kwargs)
+        response: "ChatCompletion" = await create(
+            **request.model_dump(exclude_none=True)
+        )
+        return response
 
     async def generate_image(
@@ -128,7 +158,9 @@
     ) -> "ImagesResponse":
         # validate request
         request = ImageRequest(**marvin.settings.openai.images.model_dump() | kwargs)
-        return await self.client.images.generate(**request.model_dump())
+        return await self.client.images.generate(
+            **request.model_dump(exclude_none=True)
+        )
 
     async def generate_audio(
         self,
diff --git a/src/marvin/settings.py b/src/marvin/settings.py
index 5d1de248e..85eab6e23 100644
--- a/src/marvin/settings.py
+++ b/src/marvin/settings.py
@@ -56,8 +56,24 @@
         return tiktoken.encoding_for_model(self.model).encode
 
 
+class ChatVisionSettings(MarvinSettings):
+    model_config = SettingsConfigDict(env_prefix="marvin_chat_vision_")
+    model: str = Field(
+        description="The default vision model to use.", default="gpt-4-vision-preview"
+    )
+    temperature: float = Field(description="The default temperature to use.", default=1)
+    max_tokens: int = 500
+
+    @property
+    def encoder(self):
+        import tiktoken
+
+        return tiktoken.encoding_for_model(self.model).encode
+
+
 class ChatSettings(MarvinSettings):
     completions: ChatCompletionSettings = Field(default_factory=ChatCompletionSettings)
+    vision: ChatVisionSettings = Field(default_factory=ChatVisionSettings)
 
 
 class ImageSettings(MarvinSettings):
diff --git a/src/marvin/types.py b/src/marvin/types.py
index 10bb35edc..f4389b467 100644
--- a/src/marvin/types.py
+++ b/src/marvin/types.py
@@ -63,8 +63,31 @@
     name: str
 
 
+class ImageUrl(BaseModel):
+    url: str = Field(
+        description="URL of the image to be sent or a base64 encoded image."
+    )
+    detail: str = "auto"
+
+
+class MessageImageURLContent(BaseModel):
+    """Schema for messages containing images"""
+
+    type: Literal["image_url"] = "image_url"
+    image_url: ImageUrl
+
+
+class MessageTextContent(BaseModel):
+    """Schema for messages containing text"""
+
+    type: Literal["text"] = "text"
+    text: str
+
+
 class BaseMessage(BaseModel):
-    content: str
+    """Base schema for messages"""
+
+    content: Union[str, list[Union[MessageImageURLContent, MessageTextContent]]]
     role: str
 
 
@@ -103,9 +126,33 @@
     user: Optional[str] = None
 
 
+class VisionRequest(BaseModel):
+    messages: list[BaseMessage] = Field(default_factory=list)
+    model: str = Field(default_factory=lambda: settings.openai.chat.vision.model)
+    logit_bias: Optional[LogitBias] = None
+    max_tokens: Optional[Annotated[int, Field(strict=True, ge=1)]] = Field(
+        default_factory=lambda: settings.openai.chat.vision.max_tokens
+    )
+    frequency_penalty: Optional[
+        Annotated[float, Field(strict=True, ge=-2.0, le=2.0)]
+    ] = 0
+    n: Optional[Annotated[int, Field(strict=True, ge=1)]] = 1
+    presence_penalty: Optional[
+        Annotated[float, Field(strict=True, ge=-2.0, le=2.0)]
+    ] = 0
+    seed: Optional[int] = None
+    stop: Optional[Union[str, list[str]]] = None
+    stream: Optional[bool] = False
+    temperature: Optional[Annotated[float, Field(strict=True, ge=0, le=2)]] = Field(
+        default_factory=lambda: settings.openai.chat.vision.temperature
+    )
+    top_p: Optional[Annotated[float, Field(strict=True, ge=0, le=1)]] = 1
+    user: Optional[str] = None
+
+
 class ChatResponse(BaseModel):
     model_config = dict(arbitrary_types_allowed=True)
-    request: ChatRequest
+    request: Union[ChatRequest, VisionRequest]
     response: ChatCompletion
     tool_outputs: list[Any] = []
 
@@ -113,6 +160,7 @@
 class ImageRequest(BaseModel):
     prompt: str
     model: Optional[str] = Field(default_factory=lambda: settings.openai.images.model)
+    n: Optional[int] = 1
 
     quality: Optional[Literal["standard", "hd"]] = Field(
         default_factory=lambda: settings.openai.images.quality
diff --git a/src/marvin/utilities/images.py b/src/marvin/utilities/images.py
new file mode 100644
index 000000000..94c38647d
--- /dev/null
+++ b/src/marvin/utilities/images.py
@@ -0,0 +1,18 @@
+import base64
+from pathlib import Path
+from typing import Union
+
+
+def image_to_base64(image_path: Union[str, Path]) -> str:
+    """
+    Converts a local image file to a base64 string.
+
+    Args:
+        image_path (Union[str, Path]): The path to the image file. This can be a
+            string or a Path object.
+
+    Returns:
+        str: The base64 representation of the image.
+    """
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode("utf-8")
diff --git a/tests/apis/test_extract.py b/tests/apis/test_extract.py
index cab6e6d6e..168fceb36 100644
--- a/tests/apis/test_extract.py
+++ b/tests/apis/test_extract.py
@@ -44,7 +44,11 @@
         )
         assert result == ["John", "Mary", "Bob"]
 
+    @pytest.mark.flaky(max_runs=3)
     def test_float_to_int(self):
+        # gpt 3.5 sometimes struggles with this test, marked as flaky
+        # pydantic no longer casts floats to ints, but gpt-3.5 assumes it's
+        # ok even when given instructions not to. GPT-4 seems to work ok.
         result = marvin.extract("the numbers are 1, 2, and 3.2", int)
         assert result == [1, 2, 3]
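A footnote on the flaky-test comment above: pydantic v2 rejects lossy float-to-int coercion, so `3.2` must be rounded by the model itself rather than by validation. A minimal sketch of the validation behavior the comment refers to, assuming pydantic v2; the `Numbers` model is illustrative:

```python
from pydantic import BaseModel, ValidationError


class Numbers(BaseModel):
    values: list[int]


# Lossy coercion is rejected: 3.2 raises instead of truncating to 3,
# which is why the LLM must return integers for the test to pass.
try:
    Numbers(values=[1, 2, 3.2])
except ValidationError as e:
    print(e)  # "Input should be a valid integer, got a number with a fractional part"

# Exact floats are still accepted and coerced.
assert Numbers(values=[1, 2, 3.0]).values == [1, 2, 3]
```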