mirror of
https://github.com/browser-use/browser-use
synced 2026-05-06 17:52:15 +02:00
114 lines
2.8 KiB
Python
"""
|
|
Show how to use sample_images to add image context for your task
|
|
"""
|
|
|
|
import asyncio
|
|
import base64
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from dotenv import load_dotenv
|
|
|
|
from browser_use import Agent
|
|
from browser_use.llm import ChatOpenAI
|
|
from browser_use.llm.messages import ContentPartImageParam, ContentPartTextParam, ImageURL
|
|
|
|
# Load environment variables
|
|
load_dotenv()
|
|
|
|
|
|
def image_to_base64(image_path: str) -> str:
    """
    Convert image file to base64 string.

    Args:
        image_path: Path to the image file

    Returns:
        Base64 encoded string of the image

    Raises:
        FileNotFoundError: If image file doesn't exist
        IOError: If image file cannot be read
    """
    image_file = Path(image_path)
    if not image_file.exists():
        raise FileNotFoundError(f'Image file not found: {image_path}')

    try:
        # read_bytes() handles open/close for us; decode the b64 bytes to str.
        return base64.b64encode(image_file.read_bytes()).decode('utf-8')
    except OSError as e:
        # Chain with `from e` so the original cause stays visible in tracebacks.
        raise OSError(f'Failed to read image file: {e}') from e
|
|
|
|
|
|
def create_sample_images(image_path: str = 'sample_image.png') -> list[ContentPartTextParam | ContentPartImageParam]:
    """
    Create image context for the agent.

    Args:
        image_path: Path to the annotated screenshot to attach.
            Defaults to 'sample_image.png' — replace with your actual image path.

    Returns:
        list of content parts containing text and image data

    Raises:
        FileNotFoundError: If the image file doesn't exist
        OSError: If the image file cannot be read
    """
    # Text that tells the model how to interpret the annotated screenshot.
    description = (
        'The following image explains the google layout. '
        'The image highlights several buttons with red boxes, '
        'and next to them are corresponding labels in red text.\n'
        'Each label corresponds to a button as follows:\n'
        'Label 1 is the "image" button.'
    )

    # Build the typed content parts directly, rather than constructing an
    # intermediate untyped dict list and immediately re-converting it.
    return [
        ContentPartTextParam(text=description),
        ContentPartImageParam(
            image_url=ImageURL(
                url=f'data:image/jpeg;base64,{image_to_base64(image_path)}',
                media_type='image/jpeg',
            ),
        ),
    ]
|
|
|
|
|
|
async def main() -> None:
    """
    Main function to run the browser agent with image context.
    """
    # Set up the language model used by the agent.
    llm = ChatOpenAI(model='gpt-4.1')

    # Build the image context; if the sample image can't be loaded,
    # continue without it so the demo still runs.
    try:
        images = create_sample_images()
    except (FileNotFoundError, OSError) as e:
        print(f'Error loading sample images: {e}')
        print('Continuing without sample images...')
        images = []

    # Create the agent with the task and image context, then run it.
    agent = Agent(
        task='goto https://www.google.com/ and click image button',
        llm=llm,
        sample_images=images,
    )
    await agent.run()
|
|
|
|
|
|
# Script entry point: run the async demo under asyncio's event loop.
if __name__ == '__main__':
    asyncio.run(main())
|