browser-use/browser_use/llm/tests/test_gemini_image.py

import asyncio
import base64
import io
import random

from PIL import Image, ImageDraw, ImageFont

from browser_use.llm.google.chat import ChatGoogle
from browser_use.llm.google.serializer import GoogleMessageSerializer
from browser_use.llm.messages import (
	BaseMessage,
	ContentPartImageParam,
	ContentPartTextParam,
	ImageURL,
	SystemMessage,
	UserMessage,
)


def create_random_text_image(text: str = 'hello world', width: int = 4000, height: int = 4000) -> str:
	# Create image with random background color
	bg_color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
	image = Image.new('RGB', (width, height), bg_color)
	draw = ImageDraw.Draw(image)

	# Try to use a default font, fallback to default if not available
	try:
		font = ImageFont.truetype('arial.ttf', 24)
	except Exception:
		font = ImageFont.load_default()

	# Calculate text position to center it
	bbox = draw.textbbox((0, 0), text, font=font)
	text_width = bbox[2] - bbox[0]
	text_height = bbox[3] - bbox[1]
	x = (width - text_width) // 2
	y = (height - text_height) // 2

	# Draw text with contrasting color
	text_color = (255 - bg_color[0], 255 - bg_color[1], 255 - bg_color[2])
	draw.text((x, y), text, fill=text_color, font=font)

	# Convert to base64
	buffer = io.BytesIO()
	image.save(buffer, format='JPEG')
	img_data = base64.b64encode(buffer.getvalue()).decode()

	return f'data:image/jpeg;base64,{img_data}'


async def test_gemini_image_vision():
	"""Test Gemini's ability to see and describe images."""

	# Create the LLM
	llm = ChatGoogle(model='gemini-2.0-flash-exp')

	# Create a random image with text
	image_data_url = create_random_text_image('Hello Gemini! Can you see this text?')

	# Create messages with image
	messages: list[BaseMessage] = [
		SystemMessage(content='You are a helpful assistant that can see and describe images.'),
		UserMessage(
			content=[
				ContentPartTextParam(text='What do you see in this image? Please describe the text and any visual elements.'),
				ContentPartImageParam(image_url=ImageURL(url=image_data_url)),
			]
		),
	]

	# Serialize messages for Google format
	serializer = GoogleMessageSerializer()
	formatted_messages, system_message = serializer.serialize_messages(messages)

	print('Testing Gemini image vision...')
	print(f'System message: {system_message}')

	# Make the API call
	try:
		response = await llm.ainvoke(messages)
		print('\n=== Gemini Response ===')
		print(response.completion)
		print(response.usage)
		print('=======================')
	except Exception as e:
		print(f'Error calling Gemini: {e}')
		print(f'Error type: {type(e)}')


if __name__ == '__main__':
	asyncio.run(test_gemini_image_vision())