Skip to content

Commit

Permalink
google anthropic vlms
Browse files Browse the repository at this point in the history
  • Loading branch information
kyleclo committed Dec 13, 2024
1 parent f12669c commit 0d0b565
Show file tree
Hide file tree
Showing 3 changed files with 138 additions and 1 deletion.
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ dependencies = [
"numpy",
"matplotlib",
"pandas",
"python-dotenv"
"python-dotenv",
"openai",
"google-generativeai",
"anthropic"
]
license = {file = "LICENSE"}

Expand Down
82 changes: 82 additions & 0 deletions scripts/vlm_generations/anthropic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""
Generate text from an image using Anthropic's Generative AI API.
@imaslarab
"""

import sys

sys.path.append("..")

import base64

import anthropic
from dotenv import dotenv_values
from PIL import Image


class AnthropicImageToText:
def __init__(self):
config = dotenv_values("../.env")
api_key = config.get("ANTHROPIC_API_KEY")
self.model = "claude-3-5-sonnet-20240620"

self.client = anthropic.Anthropic(api_key=api_key)

def encode_image(self, image_path):
"""Encode the image in base64 format."""
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")

def get_image_format(self, image_path):
image = Image.open(image_path)
image_format = ("image/" + image.format).lower()

return image_format

def get_response(self, image_path, system_prompt, user_prompt=None):
"""Send a chat completion request with the image input."""

user_prompt = user_prompt or "Describe the image."
encoded_image = self.encode_image(image_path)
image_media_type = self.get_image_format(image_path)

response = self.client.messages.create(
model=self.model,
max_tokens=256,
system=system_prompt,
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": user_prompt},
{
"type": "image",
"source": {
"type": "base64",
"media_type": image_media_type,
"data": encoded_image,
},
},
],
}
],
)

return response.content[0].text


if __name__ == "__main__":
image_path = "/path/to/images/0f56fb1b-c5f6-405a-b97d-5adb1bd758b8.jpeg"
prompt = """
You are a teacher and are given a student’s handwritten work in an image format. The handwritten work is a response
to a problem. Write a description for this image to explain everything in the image of the student’s handwritten work
in as much detail as you can so that another teacher can understand and reconstruct the math work in this image without
viewing the image. Focus on describing the student’s answers in the image. Your response should be a paragraph without bullet points.
"""

anthropic_api = AnthropicImageToText()
response = anthropic_api.get_response(image_path, prompt)
print(response)
52 changes: 52 additions & 0 deletions scripts/vlm_generations/google.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""
Generate text from an image using Google's Generative AI API.
@imaslarab
"""

import sys

sys.path.append("..")

import base64

import google.generativeai as genai
from dotenv import dotenv_values
from PIL import Image


class GoogleAIImageToText:
def __init__(self):
config = dotenv_values("../.env")
api_key = config.get("GEMINI_API_KEY")
# self.model = "gemini-1.5-flash"
self.model = "gemini-1.5-pro"

genai.configure(api_key=api_key)
self.client = genai.GenerativeModel(self.model)

def get_response(self, image_path, system_prompt, user_prompt=None):
"""Send a chat completion request with the image input."""

user_prompt = user_prompt or "Describe the image."
image = Image.open(image_path)

response = self.client.generate_content([system_prompt + " " + user_prompt, image])

return response.text


if __name__ == "__main__":
image_path = "/path/to/images/0f56fb1b-c5f6-405a-b97d-5adb1bd758b8.jpeg"
prompt = """
You are a teacher and are given a student’s handwritten work in an image format. The handwritten work is a response
to a problem. Write a description for this image to explain everything in the image of the student’s handwritten work
in as much detail as you can so that another teacher can understand and reconstruct the math work in this image without
viewing the image. Focus on describing the student’s answers in the image. Your response should be a paragraph without bullet points.
"""

anthropic_api = GoogleAIImageToText()
response = anthropic_api.get_response(image_path, prompt)
print(response)

0 comments on commit 0d0b565

Please sign in to comment.