# Mirror of https://github.com/browser-use/browser-use
# Synced 2026-05-06 17:52:15 +02:00
# 92 lines, 2.6 KiB, Python
import asyncio
|
|
import base64
|
|
import io
|
|
import random
|
|
|
|
from PIL import Image, ImageDraw, ImageFont
|
|
|
|
from browser_use.llm.google.chat import ChatGoogle
|
|
from browser_use.llm.google.serializer import GoogleMessageSerializer
|
|
from browser_use.llm.messages import (
|
|
BaseMessage,
|
|
ContentPartImageParam,
|
|
ContentPartTextParam,
|
|
ImageURL,
|
|
SystemMessage,
|
|
UserMessage,
|
|
)
|
|
|
|
|
|
def create_random_text_image(text: str = 'hello world', width: int = 4000, height: int = 4000) -> str:
    """Render ``text`` centered on a randomly colored canvas and return it as a JPEG data URL.

    Args:
        text: The string to draw on the image.
        width: Canvas width in pixels.
        height: Canvas height in pixels.

    Returns:
        A ``data:image/jpeg;base64,...`` URL holding the base64-encoded JPEG.
    """
    # Random background so every call produces a different image.
    bg_color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
    image = Image.new('RGB', (width, height), bg_color)
    draw = ImageDraw.Draw(image)

    # Prefer Arial; deliberately best-effort, so any failure to load it
    # falls back to the font returned by ImageFont.load_default().
    try:
        font = ImageFont.truetype('arial.ttf', 24)
    except Exception:
        font = ImageFont.load_default()

    # textbbox() reports the box relative to the (0, 0) anchor, and its
    # left/top edges are generally not zero. Subtract them so the rendered
    # glyphs themselves (not the anchor point) end up centered.
    left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
    x = (width - (right - left)) // 2 - left
    y = (height - (bottom - top)) // 2 - top

    # Inverting the background gives almost the same color when the
    # background is mid-gray, which can make the text unreadable. Pick black
    # or white from the background's luma so the contrast is always strong.
    luma = 0.299 * bg_color[0] + 0.587 * bg_color[1] + 0.114 * bg_color[2]
    text_color = (0, 0, 0) if luma >= 128 else (255, 255, 255)
    draw.text((x, y), text, fill=text_color, font=font)

    # Encode the image in memory and wrap it in a data URL.
    buffer = io.BytesIO()
    image.save(buffer, format='JPEG')
    img_data = base64.b64encode(buffer.getvalue()).decode()

    return f'data:image/jpeg;base64,{img_data}'
|
|
|
|
|
|
async def test_gemini_image_vision():
    """Send a generated text image to Gemini and print what it reports seeing.

    Builds a system + user conversation whose user turn carries a text
    question and an image data URL, prints the serialized system message,
    then invokes the model. Any exception raised during the call or while
    printing the response is caught and printed instead of propagating.
    """
    # Model client under test.
    llm = ChatGoogle(model='gemini-2.0-flash-exp')

    # Freshly generated picture containing the text the model should read.
    picture_url = create_random_text_image('Hello Gemini! Can you see this text?')

    # User turn: a question followed by the image itself.
    question_part = ContentPartTextParam(
        text='What do you see in this image? Please describe the text and any visual elements.'
    )
    picture_part = ContentPartImageParam(image_url=ImageURL(url=picture_url))
    messages: list[BaseMessage] = [
        SystemMessage(content='You are a helpful assistant that can see and describe images.'),
        UserMessage(content=[question_part, picture_part]),
    ]

    # Run the Google serializer only to display the extracted system message;
    # the formatted message list it also returns is not used here.
    _formatted, system_prompt = GoogleMessageSerializer().serialize_messages(messages)

    print('Testing Gemini image vision...')
    print(f'System message: {system_prompt}')

    # Call the model and dump the completion and usage; report failures.
    try:
        reply = await llm.ainvoke(messages)
        print('\n=== Gemini Response ===')
        print(reply.completion)
        print(reply.usage)
        print('=======================')
    except Exception as exc:
        print(f'Error calling Gemini: {exc}')
        print(f'Error type: {type(exc)}')
|
|
|
|
|
|
# Script entry point: run the Gemini vision check on a fresh event loop
# when this file is executed directly (not when it is imported).
if __name__ == '__main__':
    vision_check = test_gemini_image_vision()
    asyncio.run(vision_check)
|