custom-functions examples

2026-05-06 17:52:15 +02:00 · 2025-08-18 18:51:58 -07:00
parent 4f5932b5a8
commit 1d34d2ed22
10 changed files with 39 additions and 1138 deletions
--- a/examples/custom-functions/2fa.py
+++ b/examples/custom-functions/2fa.py
@@ -56,7 +56,7 @@ async def main():
 	You are completely FORBIDDEN to use any other method to get the 2FA code.
 	"""

-	model = ChatOpenAI(model='gpt-4.1')
+	model = ChatOpenAI(model='gpt-4.1-mini')
 	agent = Agent(task=task, llm=model, controller=controller)

 	result = await agent.run()
--- a/examples/custom-functions/clipboard.py
+++ b/examples/custom-functions/clipboard.py
@@ -1,59 +0,0 @@
-import asyncio
-import os
-import sys
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
-
-from dotenv import load_dotenv
-
-load_dotenv()
-
-import pyperclip
-
-from browser_use import Agent, Controller
-from browser_use.agent.views import ActionResult
-from browser_use.browser import BrowserProfile, BrowserSession
-from browser_use.browser.types import Page
-from browser_use.llm import ChatOpenAI
-
-browser_profile = BrowserProfile(
-	headless=False,
-)
-controller = Controller()
-
-
-@controller.registry.action('Copy text to clipboard')
-def copy_to_clipboard(text: str):
-	pyperclip.copy(text)
-	return ActionResult(extracted_content=text)
-
-
-@controller.registry.action('Paste text from clipboard')
-async def paste_from_clipboard(page: Page):
-	text = pyperclip.paste()
-	# send text to browser
-	await page.keyboard.type(text)
-
-	return ActionResult(extracted_content=text)
-
-
-async def main():
-	task = 'Copy the text "Hello, world!" to the clipboard, then go to google.com and paste the text'
-	model = ChatOpenAI(model='gpt-4.1')
-	browser_session = BrowserSession(browser_profile=browser_profile)
-	await browser_session.start()
-	agent = Agent(
-		task=task,
-		llm=model,
-		controller=controller,
-		browser_session=browser_session,
-	)
-
-	await agent.run()
-	await browser_session.kill()
-
-	input('Press Enter to close...')
-
-
-if __name__ == '__main__':
-	asyncio.run(main())
--- a/examples/custom-functions/custom_hooks_before_after_step.py
+++ b/examples/custom-functions/custom_hooks_before_after_step.py
@@ -1,235 +0,0 @@
-"""
-Description: These Python modules are designed to capture detailed
-browser usage data for analysis, with both server and client
-components working together to record and store the information.
-
-Author: Carlos A. Planchón
-https://github.com/carlosplanchon/
-
-Adapt this code to your needs.
-
-Feedback is appreciated!
-"""
-
-#####################
-#                   #
-#   --- UTILS ---   #
-#                   #
-#####################
-
-import base64
-
-
-def b64_to_png(b64_string: str, output_file):
-	"""
-	Convert a Base64-encoded string to a PNG file.
-
-	:param b64_string: A string containing Base64-encoded data
-	:param output_file: The path to the output PNG file
-	"""
-	with open(output_file, 'wb') as f:
-		f.write(base64.b64decode(b64_string))
-
-
-###################################################################
-#                                                                 #
-#   --- FASTAPI API TO RECORD AND SAVE Browser-Use ACTIVITY ---   #
-#                                                                 #
-###################################################################
-
-# Save to api.py and run with `python api.py`
-
-# ! pip install uvicorn
-# ! pip install fastapi
-# ! pip install prettyprinter
-
-import json
-from pathlib import Path
-
-import prettyprinter  # type: ignore
-from fastapi import FastAPI, Request
-
-prettyprinter.install_extras()
-
-app = FastAPI()
-
-
-@app.post('/post_agent_history_step')
-async def post_agent_history_step(request: Request):
-	data = await request.json()
-	prettyprinter.cpprint(data)
-
-	# Ensure the "recordings" folder exists using pathlib
-	recordings_folder = Path('recordings')
-	recordings_folder.mkdir(exist_ok=True)
-
-	# Determine the next file number by examining existing .json files
-	existing_numbers = []
-	for item in recordings_folder.iterdir():
-		if item.is_file() and item.suffix == '.json':
-			try:
-				file_num = int(item.stem)
-				existing_numbers.append(file_num)
-			except ValueError:
-				# In case the file name isn't just a number
-				...
-
-	if existing_numbers:
-		next_number = max(existing_numbers) + 1
-	else:
-		next_number = 1
-
-	# Construct the file path
-	file_path = recordings_folder / f'{next_number}.json'
-
-	# Save the JSON data to the file
-	with file_path.open('w') as f:
-		json.dump(data, f, indent=2)
-
-	return {'status': 'ok', 'message': f'Saved to {file_path}'}
-
-
-if __name__ == '__main__':
-	import uvicorn
-
-	uvicorn.run(app, host='0.0.0.0', port=9000)
-
-
-##############################################################
-#                                                            #
-#   --- CLIENT TO RECORD AND SAVE Browser-Use ACTIVITY ---   #
-#                                                            #
-##############################################################
-
-"""
-pyobjtojson:
-
-A Python library to safely and recursively serialize any Python object
-(including Pydantic models and dataclasses) into JSON-ready structures,
-gracefully handling circular references.
-"""
-
-# ! pip install -U pyobjtojson
-# ! pip install -U prettyprinter
-
-import asyncio
-import os
-import sys
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
-
-from dotenv import load_dotenv
-
-load_dotenv()
-
-import requests
-from pyobjtojson import obj_to_json  # type: ignore
-
-from browser_use import Agent
-from browser_use.llm import ChatOpenAI
-
-# import prettyprinter
-# prettyprinter.install_extras()
-
-
-def send_agent_history_step(data):
-	url = 'http://127.0.0.1:9000/post_agent_history_step'
-	response = requests.post(url, json=data)
-	return response.json()
-
-
-async def record_activity(agent_obj):
-	website_html = None
-	website_screenshot = None
-	urls_json_last_elem = None
-	model_thoughts_last_elem = None
-	model_outputs_json_last_elem = None
-	model_actions_json_last_elem = None
-	extracted_content_json_last_elem = None
-
-	print('--- ON_STEP_START HOOK ---')
-	website_html = await agent_obj.browser_context.get_page_html()
-	website_screenshot = await agent_obj.browser_context.take_screenshot()
-
-	print('--> History:')
-	# Assert agent has state to satisfy type checker
-	assert hasattr(agent_obj, 'state'), 'Agent must have state attribute'
-	history = agent_obj.history
-
-	model_thoughts = obj_to_json(obj=history.model_thoughts(), check_circular=False)
-
-	# print("--- MODEL THOUGHTS ---")
-	if len(model_thoughts) > 0:
-		model_thoughts_last_elem = model_thoughts[-1]
-		# prettyprinter.cpprint(model_thoughts_last_elem)
-
-	# print("--- MODEL OUTPUT ACTION ---")
-	model_outputs = agent_obj.history.model_outputs()
-	model_outputs_json = obj_to_json(obj=model_outputs, check_circular=False)
-
-	if len(model_outputs_json) > 0:
-		model_outputs_json_last_elem = model_outputs_json[-1]
-		# prettyprinter.cpprint(model_outputs_json_last_elem)
-
-	# print("--- MODEL INTERACTED ELEM ---")
-	model_actions = agent_obj.history.model_actions()
-	model_actions_json = obj_to_json(obj=model_actions, check_circular=False)
-
-	if len(model_actions_json) > 0:
-		model_actions_json_last_elem = model_actions_json[-1]
-		# prettyprinter.cpprint(model_actions_json_last_elem)
-
-	# print("--- EXTRACTED CONTENT ---")
-	extracted_content = agent_obj.history.extracted_content()
-	extracted_content_json = obj_to_json(obj=extracted_content, check_circular=False)
-	if len(extracted_content_json) > 0:
-		extracted_content_json_last_elem = extracted_content_json[-1]
-		# prettyprinter.cpprint(extracted_content_json_last_elem)
-
-	# print("--- URLS ---")
-	urls = agent_obj.history.urls()
-	# prettyprinter.cpprint(urls)
-	urls_json = obj_to_json(obj=urls, check_circular=False)
-
-	if len(urls_json) > 0:
-		urls_json_last_elem = urls_json[-1]
-		# prettyprinter.cpprint(urls_json_last_elem)
-
-	model_step_summary = {
-		'website_html': website_html,
-		'website_screenshot': website_screenshot,
-		'url': urls_json_last_elem,
-		'model_thoughts': model_thoughts_last_elem,
-		'model_outputs': model_outputs_json_last_elem,
-		'model_actions': model_actions_json_last_elem,
-		'extracted_content': extracted_content_json_last_elem,
-	}
-
-	print('--- MODEL STEP SUMMARY ---')
-	# prettyprinter.cpprint(model_step_summary)
-
-	send_agent_history_step(data=model_step_summary)
-
-	# response = send_agent_history_step(data=history)
-	# print(response)
-
-	# print("--> Website HTML:")
-	# print(website_html[:200])
-	# print("--> Website Screenshot:")
-	# print(website_screenshot[:200])
-
-
-agent = Agent(
-	task='Compare the price of gpt-4.1 and DeepSeek-V3',
-	llm=ChatOpenAI(model='gpt-4.1'),
-)
-
-
-async def run_agent():
-	try:
-		await agent.run(on_step_start=record_activity, max_steps=30)
-	except Exception as e:
-		print(e)
-
-
-asyncio.run(run_agent())
--- a/examples/custom-functions/drag_and_drop.py
+++ b/examples/custom-functions/drag_and_drop.py
@@ -1,320 +0,0 @@
-"""
-Drag and Drop Custom Action Example
-
-This example demonstrates how to implement drag and drop functionality as a custom action.
-The drag and drop action supports both element-based and coordinate-based operations,
-making it useful for canvas drawing, sortable lists, sliders, file uploads, and UI rearrangement.
-"""
-
-import asyncio
-from typing import cast
-
-from pydantic import BaseModel, Field
-
-from browser_use import ActionResult, Agent, Controller
-from browser_use.browser.types import ElementHandle, Page
-from browser_use.llm import ChatOpenAI
-
-
-class Position(BaseModel):
-	"""Represents a position with x and y coordinates."""
-
-	x: int = Field(..., description='X coordinate')
-	y: int = Field(..., description='Y coordinate')
-
-
-class DragDropAction(BaseModel):
-	"""Parameters for drag and drop operations."""
-
-	# Element-based approach
-	element_source: str | None = Field(None, description='CSS selector or XPath for the source element to drag')
-	element_target: str | None = Field(None, description='CSS selector or XPath for the target element to drop on')
-	element_source_offset: Position | None = Field(None, description='Optional offset from source element center (x, y)')
-	element_target_offset: Position | None = Field(None, description='Optional offset from target element center (x, y)')
-
-	# Coordinate-based approach
-	coord_source_x: int | None = Field(None, description='Source X coordinate for drag start')
-	coord_source_y: int | None = Field(None, description='Source Y coordinate for drag start')
-	coord_target_x: int | None = Field(None, description='Target X coordinate for drag end')
-	coord_target_y: int | None = Field(None, description='Target Y coordinate for drag end')
-
-	# Operation parameters
-	steps: int | None = Field(10, description='Number of intermediate steps during drag (default: 10)')
-	delay_ms: int | None = Field(5, description='Delay in milliseconds between steps (default: 5)')
-
-
-async def create_drag_drop_controller() -> Controller:
-	"""Create a controller with drag and drop functionality."""
-	controller = Controller()
-
-	@controller.registry.action(
-		'Drag and drop elements or between coordinates on the page - useful for canvas drawing, sortable lists, sliders, file uploads, and UI rearrangement',
-		param_model=DragDropAction,
-	)
-	async def drag_drop(params: DragDropAction, page: Page) -> ActionResult:
-		"""
-		Performs a precise drag and drop operation between elements or coordinates.
-		"""
-
-		async def get_drag_elements(
-			page: Page,
-			source_selector: str,
-			target_selector: str,
-		) -> tuple[ElementHandle | None, ElementHandle | None]:
-			"""Get source and target elements with appropriate error handling."""
-			source_element = None
-			target_element = None
-
-			try:
-				# page.locator() auto-detects CSS and XPath
-				source_locator = page.locator(source_selector)
-				target_locator = page.locator(target_selector)
-
-				# Check if elements exist
-				source_count = await source_locator.count()
-				target_count = await target_locator.count()
-
-				if source_count > 0:
-					source_element = await source_locator.first.element_handle()
-					print(f'Found source element with selector: {source_selector}')
-				else:
-					print(f'Source element not found: {source_selector}')
-
-				if target_count > 0:
-					target_element = await target_locator.first.element_handle()
-					print(f'Found target element with selector: {target_selector}')
-				else:
-					print(f'Target element not found: {target_selector}')
-
-			except Exception as e:
-				print(f'Error finding elements: {str(e)}')
-
-			return source_element, target_element
-
-		async def get_element_coordinates(
-			source_element: ElementHandle,
-			target_element: ElementHandle,
-			source_position: Position | None,
-			target_position: Position | None,
-		) -> tuple[tuple[int, int] | None, tuple[int, int] | None]:
-			"""Get coordinates from elements with appropriate error handling."""
-			source_coords = None
-			target_coords = None
-
-			try:
-				# Get source coordinates
-				if source_position:
-					source_coords = (source_position.x, source_position.y)
-				else:
-					source_box = await source_element.bounding_box()
-					if source_box:
-						source_coords = (
-							int(source_box['x'] + source_box['width'] / 2),
-							int(source_box['y'] + source_box['height'] / 2),
-						)
-
-				# Get target coordinates
-				if target_position:
-					target_coords = (target_position.x, target_position.y)
-				else:
-					target_box = await target_element.bounding_box()
-					if target_box:
-						target_coords = (
-							int(target_box['x'] + target_box['width'] / 2),
-							int(target_box['y'] + target_box['height'] / 2),
-						)
-			except Exception as e:
-				print(f'Error getting element coordinates: {str(e)}')
-
-			return source_coords, target_coords
-
-		async def execute_drag_operation(
-			page: Page,
-			source_x: int,
-			source_y: int,
-			target_x: int,
-			target_y: int,
-			steps: int,
-			delay_ms: int,
-		) -> tuple[bool, str]:
-			"""Execute the drag operation with comprehensive error handling."""
-			try:
-				# Try to move to source position
-				try:
-					await page.mouse.move(source_x, source_y)
-					print(f'Moved to source position ({source_x}, {source_y})')
-				except Exception as e:
-					print(f'Failed to move to source position: {str(e)}')
-					return False, f'Failed to move to source position: {str(e)}'
-
-				# Press mouse button down
-				await page.mouse.down()
-
-				# Move to target position with intermediate steps
-				for i in range(1, steps + 1):
-					ratio = i / steps
-					intermediate_x = int(source_x + (target_x - source_x) * ratio)
-					intermediate_y = int(source_y + (target_y - source_y) * ratio)
-
-					await page.mouse.move(intermediate_x, intermediate_y)
-
-					if delay_ms > 0:
-						await asyncio.sleep(delay_ms / 1000)
-
-				# Move to final target position
-				await page.mouse.move(target_x, target_y)
-
-				# Move again to ensure dragover events are properly triggered
-				await page.mouse.move(target_x, target_y)
-
-				# Release mouse button
-				await page.mouse.up()
-
-				return True, 'Drag operation completed successfully'
-
-			except Exception as e:
-				return False, f'Error during drag operation: {str(e)}'
-
-		try:
-			# Initialize variables
-			source_x: int | None = None
-			source_y: int | None = None
-			target_x: int | None = None
-			target_y: int | None = None
-
-			# Normalize parameters
-			steps = max(1, params.steps or 10)
-			delay_ms = max(0, params.delay_ms or 5)
-
-			# Case 1: Element selectors provided
-			if params.element_source and params.element_target:
-				print('Using element-based approach with selectors')
-
-				source_element, target_element = await get_drag_elements(
-					page,
-					params.element_source,
-					params.element_target,
-				)
-
-				if not source_element or not target_element:
-					error_msg = f'Failed to find {"source" if not source_element else "target"} element'
-					return ActionResult(error=error_msg, include_in_memory=True)
-
-				source_coords, target_coords = await get_element_coordinates(
-					source_element, target_element, params.element_source_offset, params.element_target_offset
-				)
-
-				if not source_coords or not target_coords:
-					error_msg = f'Failed to determine {"source" if not source_coords else "target"} coordinates'
-					return ActionResult(error=error_msg, include_in_memory=True)
-
-				source_x, source_y = source_coords
-				target_x, target_y = target_coords
-
-			# Case 2: Coordinates provided directly
-			elif all(
-				coord is not None
-				for coord in [params.coord_source_x, params.coord_source_y, params.coord_target_x, params.coord_target_y]
-			):
-				print('Using coordinate-based approach')
-				source_x = params.coord_source_x
-				source_y = params.coord_source_y
-				target_x = params.coord_target_x
-				target_y = params.coord_target_y
-			else:
-				error_msg = 'Must provide either source/target selectors or source/target coordinates'
-				return ActionResult(error=error_msg, include_in_memory=True)
-
-			# Validate coordinates
-			if any(coord is None for coord in [source_x, source_y, target_x, target_y]):
-				error_msg = 'Failed to determine source or target coordinates'
-				return ActionResult(error=error_msg, include_in_memory=True)
-
-			# Perform the drag operation
-			success, message = await execute_drag_operation(
-				page,
-				cast(int, source_x),
-				cast(int, source_y),
-				cast(int, target_x),
-				cast(int, target_y),
-				steps,
-				delay_ms,
-			)
-
-			if not success:
-				print(f'Drag operation failed: {message}')
-				return ActionResult(error=message, include_in_memory=True)
-
-			# Create descriptive message
-			if params.element_source and params.element_target:
-				msg = f"🖱️ Dragged element '{params.element_source}' to '{params.element_target}'"
-			else:
-				msg = f'🖱️ Dragged from ({source_x}, {source_y}) to ({target_x}, {target_y})'
-
-			print(msg)
-			return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=msg)
-
-		except Exception as e:
-			error_msg = f'Failed to perform drag and drop: {str(e)}'
-			print(error_msg)
-			return ActionResult(error=error_msg, include_in_memory=True)
-
-	return controller
-
-
-async def example_drag_drop_sortable_list():
-	"""Example: Drag and drop to reorder items in a sortable list."""
-
-	controller = await create_drag_drop_controller()
-
-	# Initialize LLM (replace with your preferred model)
-	llm = ChatOpenAI(model='gpt-4.1')
-
-	# Create the agent
-	agent = Agent(
-		task='Go to a drag and drop demo website and reorder some list items using drag and drop',
-		llm=llm,
-		controller=controller,
-	)
-
-	# Run the agent
-	print('🚀 Starting drag and drop example...')
-	history = await agent.run()
-
-	return history
-
-
-async def example_drag_drop_coordinates():
-	"""Example: Direct coordinate-based drag and drop."""
-
-	controller = await create_drag_drop_controller()
-	llm = ChatOpenAI(model='gpt-4.1')
-
-	agent = Agent(
-		task='Go to a canvas drawing website and draw a simple line using drag and drop from coordinates (100, 100) to (300, 200)',
-		llm=llm,
-		controller=controller,
-	)
-
-	print('🎨 Starting coordinate-based drag and drop example...')
-	history = await agent.run()
-
-	return history
-
-
-if __name__ == '__main__':
-	# Run different examples
-	print('Choose an example:')
-	print('1. Sortable list drag and drop')
-	print('2. Coordinate-based drawing')
-
-	choice = input('Enter choice (1-3): ').strip()
-
-	if choice == '1':
-		asyncio.run(example_drag_drop_sortable_list())
-	elif choice == '2':
-		asyncio.run(example_drag_drop_coordinates())
-
-	else:
-		print('Invalid choice, running sortable list example...')
-		asyncio.run(example_drag_drop_sortable_list())
--- a/examples/custom-functions/extract_pdf_content.py
+++ b/examples/custom-functions/extract_pdf_content.py
@@ -1,90 +0,0 @@
-#!/usr/bin/env -S uv run --script
-# /// script
-# requires-python = ">=3.11"
-# dependencies = ["browser-use", "mistralai"]
-# ///
-
-import os
-import sys
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
-
-from dotenv import load_dotenv
-
-load_dotenv()
-
-import asyncio
-import logging
-
-from mistralai import Mistral  # type: ignore
-from pydantic import BaseModel, Field
-
-from browser_use import Agent, Controller
-from browser_use.agent.views import ActionResult
-from browser_use.browser import BrowserSession
-from browser_use.llm import ChatOpenAI
-
-if not os.getenv('OPENAI_API_KEY'):
-	raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.')
-
-if not os.getenv('MISTRAL_API_KEY'):
-	raise ValueError('MISTRAL_API_KEY is not set. Please add it to your environment variables.')
-
-logger = logging.getLogger(__name__)
-
-controller = Controller()
-
-
-class PdfExtractParams(BaseModel):
-	url: str = Field(description='URL to a PDF document')
-
-
-@controller.registry.action(
-	'Extract PDF Text',
-	param_model=PdfExtractParams,
-)
-def extract_mistral_ocr(params: PdfExtractParams, browser: BrowserSession) -> ActionResult:
-	"""
-	Process a PDF URL using Mistral OCR API and return the OCR response.
-
-	Args:
-	    url: URL to a PDF document
-
-	Returns:
-	    OCR response object from Mistral API
-	"""
-	api_key = os.getenv('MISTRAL_API_KEY')
-	client = Mistral(api_key=api_key)
-
-	response = client.ocr.process(
-		model='mistral-ocr-latest',
-		document={
-			'type': 'document_url',
-			'document_url': params.url,
-		},
-		include_image_base64=False,
-	)
-
-	markdown = '\n\n'.join(f'### Page {i + 1}\n{response.pages[i].markdown}' for i in range(len(response.pages)))
-	return ActionResult(
-		extracted_content=markdown,
-		include_in_memory=False,  ## PDF content can be very large, so we don't include it in memory
-	)
-
-
-async def main():
-	agent = Agent(
-		task="""
-        Objective: Navigate to the following URL, extract its contents using the Extract PDF Text action, and explain its historical significance.
-
-        URL: https://docs.house.gov/meetings/GO/GO00/20220929/115171/HHRG-117-GO00-20220929-SD010.pdf
-        """,
-		llm=ChatOpenAI(model='gpt-4.1'),
-		controller=controller,
-	)
-	result = await agent.run()
-	logger.info(result)
-
-
-if __name__ == '__main__':
-	asyncio.run(main())
--- a/examples/custom-functions/hover_element.py
+++ b/examples/custom-functions/hover_element.py
@@ -1,167 +0,0 @@
-"""
-Example of implementing hover functionality for elements.
-
-This shows how to hover over elements to trigger hover states and tooltips.
-"""
-
-import asyncio
-import os
-import sys
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
-
-from dotenv import load_dotenv
-from pydantic import BaseModel
-
-load_dotenv()
-
-from browser_use.agent.service import Agent, Controller
-from browser_use.agent.views import ActionResult
-from browser_use.browser import BrowserSession
-from browser_use.llm import ChatOpenAI
-
-# Initialize controller
-controller = Controller()
-
-
-class HoverAction(BaseModel):
-	"""Parameters for hover action"""
-
-	index: int | None = None
-	xpath: str | None = None
-	selector: str | None = None
-
-
-@controller.registry.action(
-	'Hover over an element',
-	param_model=HoverAction,  # Define this model with at least "index: int" field
-)
-async def hover_element(params: HoverAction, browser_session: BrowserSession):
-	"""
-	Hovers over the element specified by its index from the cached selector map, by XPath, or by CSS selector.
-	"""
-	try:
-		element_node = None
-
-		if params.xpath:
-			# Find element by XPath using CDP
-			cdp_session = await browser_session.get_or_create_cdp_session()
-			result = await cdp_session.cdp_client.send.Runtime.evaluate(
-				params={
-					'expression': f"""
-						(() => {{
-							const element = document.evaluate('{params.xpath}', document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
-							if (element) {{
-								const rect = element.getBoundingClientRect();
-								return {{found: true, x: rect.x + rect.width/2, y: rect.y + rect.height/2}};
-							}}
-							return {{found: false}};
-						}})()
-					""",
-					'returnByValue': True,
-				},
-				session_id=cdp_session.session_id,
-			)
-			element_info = result.get('result', {}).get('value', {})
-			if not element_info.get('found'):
-				raise Exception(f'Failed to locate element with XPath {params.xpath}')
-			x, y = element_info['x'], element_info['y']
-
-		elif params.selector:
-			# Find element by CSS selector using CDP
-			cdp_session = await browser_session.get_or_create_cdp_session()
-			result = await cdp_session.cdp_client.send.Runtime.evaluate(
-				params={
-					'expression': f"""
-						(() => {{
-							const element = document.querySelector('{params.selector}');
-							if (element) {{
-								const rect = element.getBoundingClientRect();
-								return {{found: true, x: rect.x + rect.width/2, y: rect.y + rect.height/2}};
-							}}
-							return {{found: false}};
-						}})()
-					""",
-					'returnByValue': True,
-				},
-				session_id=cdp_session.session_id,
-			)
-			element_info = result.get('result', {}).get('value', {})
-			if not element_info.get('found'):
-				raise Exception(f'Failed to locate element with CSS Selector {params.selector}')
-			x, y = element_info['x'], element_info['y']
-
-		elif params.index is not None:
-			# Use index to locate the element
-			selector_map = await browser_session.get_selector_map()
-			if params.index not in selector_map:
-				raise Exception(f'Element index {params.index} does not exist - retry or use alternative actions')
-			element_node = selector_map[params.index]
-
-			# Get element position
-			if not element_node.absolute_position:
-				raise Exception(f'Element at index {params.index} has no position information')
-
-			x = element_node.absolute_position.x + element_node.absolute_position.width / 2
-			y = element_node.absolute_position.y + element_node.absolute_position.height / 2
-
-		else:
-			raise Exception('Either index, xpath, or selector must be provided')
-
-		# Perform hover using CDP mouse events
-		cdp_session = await browser_session.get_or_create_cdp_session()
-
-		# Move mouse to the element position
-		await cdp_session.cdp_client.send.Input.dispatchMouseEvent(
-			params={
-				'type': 'mouseMoved',
-				'x': x,
-				'y': y,
-			},
-			session_id=cdp_session.session_id,
-		)
-
-		# Wait a bit for hover state to trigger
-		await asyncio.sleep(0.1)
-
-		msg = (
-			f'🖱️ Hovered over element at index {params.index}'
-			if params.index is not None
-			else f'🖱️ Hovered over element with XPath {params.xpath}'
-			if params.xpath
-			else f'🖱️ Hovered over element with selector {params.selector}'
-		)
-		return ActionResult(extracted_content=msg, include_in_memory=True)
-
-	except Exception as e:
-		error_msg = f'❌ Failed to hover over element: {str(e)}'
-		return ActionResult(error=error_msg)
-
-
-async def main():
-	"""Main function to run the example"""
-	browser_session = BrowserSession()
-	await browser_session.start()
-	llm = ChatOpenAI(model='gpt-4.1')
-
-	# Create the agent with hover capability
-	agent = Agent(
-		task="""
-            Go to a website with hover interactions, like https://www.w3schools.com/howto/howto_css_dropdown.asp
-            Try hovering over the dropdown menu to see the dropdown items appear.
-            Then describe what happens when you hover.
-        """,
-		llm=llm,
-		browser_session=browser_session,
-		controller=controller,
-	)
-
-	# Run the agent
-	await agent.run(max_steps=10)
-
-	# Cleanup
-	await browser_session.kill()
-
-
-if __name__ == '__main__':
-	asyncio.run(main())
--- a/examples/custom-functions/perplexity_search.py
+++ b/examples/custom-functions/perplexity_search.py
@@ -1,91 +0,0 @@
-import asyncio
-import os
-import sys
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
-
-from dotenv import load_dotenv
-
-load_dotenv()
-
-import logging
-
-from pydantic import BaseModel
-
-from browser_use import ActionResult, Agent, Controller
-from browser_use.browser.profile import BrowserProfile
-from browser_use.llm import ChatOpenAI
-
-logger = logging.getLogger(__name__)
-
-
-class Person(BaseModel):
-	name: str
-	email: str | None = None
-
-
-class PersonList(BaseModel):
-	people: list[Person]
-
-
-PERPLEXITY_API_KEY = os.getenv('PERPLEXITY_API_KEY')
-if not PERPLEXITY_API_KEY:
-	raise ValueError('PERPLEXITY_API_KEY is not set')
-
-controller = Controller(exclude_actions=['search_google'], output_model=PersonList)
-
-
-@controller.registry.action('Search the web for a specific query with perplexity')
-async def search_web(query: str):
-	import httpx
-
-	url = 'https://api.perplexity.ai/chat/completions'
-
-	payload = {
-		'model': 'sonar',
-		'messages': [
-			{'role': 'system', 'content': 'Be precise and concise.'},
-			{'role': 'user', 'content': query},
-		],
-	}
-	headers = {'Authorization': f'Bearer {PERPLEXITY_API_KEY}', 'Content-Type': 'application/json'}
-
-	async with httpx.AsyncClient() as client:
-		response = await client.post(url, json=payload, headers=headers)
-		response.raise_for_status()
-		response_json = response.json()
-		content = response_json['choices'][0]['message']['content']
-		citations = response_json['citations']
-		output = f'{content}\n\nCitations:\n' + '\n'.join(citations)
-		logger.info(output)
-		return ActionResult(extracted_content=output, include_in_memory=True)
-
-
-names = [
-	'Ruedi Aebersold',
-	'Bernd Bodenmiller',
-	'Eugene Demler',
-]
-
-
-async def main():
-	task = 'use search_web with "find email address of the following ETH professor:" for each of the persons. Finally return the list with name and email if provided '
-	task += '\n' + '\n'.join(names)
-	model = ChatOpenAI(model='gpt-4.1')
-	browser_profile = BrowserProfile()
-	agent = Agent(task=task, llm=model, controller=controller, browser_profile=browser_profile)
-
-	history = await agent.run()
-
-	result = history.final_result()
-	if result:
-		parsed: PersonList = PersonList.model_validate_json(result)
-
-		for person in parsed.people:
-			print(f'{person.name} - {person.email}')
-	else:
-		print('No result')
-
-
-if __name__ == '__main__':
-	asyncio.run(main())
--- a/examples/custom-functions/save_pdf.py
+++ b/examples/custom-functions/save_pdf.py
@@ -1,56 +0,0 @@
-import asyncio
-import os
-import re
-import sys
-from pathlib import Path
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
-
-from dotenv import load_dotenv
-
-load_dotenv()
-
-from browser_use import ActionResult, Agent, Controller
-from browser_use.browser.types import Page
-from browser_use.llm import ChatOpenAI
-
-# Initialize controller
-controller = Controller()
-
-download_path = Path.cwd() / 'downloads'
-download_path.mkdir(parents=True, exist_ok=True)
-
-
-# Save PDF - exact copy from original controller function
-@controller.registry.action('Save the current page as a PDF file')
-async def save_pdf(page: Page):
-	short_url = re.sub(r'^https?://(?:www\.)?|/$', '', page.url)
-	slug = re.sub(r'[^a-zA-Z0-9]+', '-', short_url).strip('-').lower()
-	sanitized_filename = f'{slug}.pdf'
-
-	await page.emulate_media(media='screen')
-	await page.pdf(path=download_path / sanitized_filename, format='A4', print_background=False)
-	msg = f'Saving page with URL {page.url} as PDF to {download_path / sanitized_filename}'
-	return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory=f'Saved PDF to {sanitized_filename}')
-
-
-async def main():
-	"""
-	Example task: Navigate to browser-use.com and save the page as a PDF
-	"""
-	task = """
-	Go to https://browser-use.com/ and save the page as a PDF file.
-	"""
-
-	# Initialize the language model
-	model = ChatOpenAI(model='gpt-4.1-mini')
-
-	# Create and run the agent
-	agent = Agent(task=task, llm=model, controller=controller)
-
-	result = await agent.run()
-	print(f'🎯 Task completed: {result}')
-
-
-if __name__ == '__main__':
-	asyncio.run(main())
--- a/examples/custom-functions/solve_amazon_captcha.py
+++ b/examples/custom-functions/solve_amazon_captcha.py
@@ -1,119 +0,0 @@
-import asyncio
-import os
-import sys
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
-
-from dotenv import load_dotenv
-
-load_dotenv()
-
-from amazoncaptcha import AmazonCaptcha  # type: ignore
-
-from browser_use import ActionResult
-from browser_use.agent.service import Agent
-from browser_use.browser import BrowserConfig, BrowserSession
-from browser_use.controller.service import Controller
-from browser_use.llm import ChatOpenAI
-
-browser_profile = BrowserConfig(headless=False)
-
-# Initialize controller first
-controller = Controller()
-
-
-@controller.action(
-	'Solve Amazon text based captcha',
-	domains=[
-		'*.amazon.com',
-		'*.amazon.co.uk',
-		'*.amazon.ca',
-		'*.amazon.de',
-		'*.amazon.es',
-		'*.amazon.fr',
-		'*.amazon.it',
-		'*.amazon.co.jp',
-		'*.amazon.in',
-		'*.amazon.cn',
-		'*.amazon.com.sg',
-		'*.amazon.com.mx',
-		'*.amazon.ae',
-		'*.amazon.com.br',
-		'*.amazon.nl',
-		'*.amazon.com.au',
-		'*.amazon.com.tr',
-		'*.amazon.sa',
-		'*.amazon.se',
-		'*.amazon.pl',
-	],
-)
-async def solve_amazon_captcha(browser_session: BrowserSession):
-	if not browser_session.agent_focus:
-		raise ValueError('No active browser session')
-
-	# Find the captcha image and extract its src using CDP
-	result = await browser_session.agent_focus.cdp_client.send.Runtime.evaluate(
-		params={
-			'expression': """
-				const img = document.querySelector('img[src*="amazon.com/captcha"]');
-				img ? img.src : null;
-			""",
-			'returnByValue': True,
-		},
-		session_id=browser_session.agent_focus.session_id,
-	)
-	link = result.get('result', {}).get('value')
-
-	if not link:
-		raise ValueError('Could not find captcha image on the page')
-
-	captcha = AmazonCaptcha.fromlink(link)
-	solution = captcha.solve()
-	if not solution or solution == 'Not solved':
-		raise ValueError('Captcha could not be solved')
-
-	# Fill the captcha solution using CDP
-	await browser_session.agent_focus.cdp_client.send.Runtime.evaluate(
-		params={
-			'expression': f"""
-				const input = document.querySelector('#captchacharacters');
-				if (input) {{
-					input.value = '{solution}';
-					input.dispatchEvent(new Event('input', {{ bubbles: true }}));
-					input.dispatchEvent(new Event('change', {{ bubbles: true }}));
-				}}
-			""",
-		},
-		session_id=browser_session.agent_focus.session_id,
-	)
-
-	# Click submit button using CDP
-	await browser_session.agent_focus.cdp_client.send.Runtime.evaluate(
-		params={
-			'expression': """
-				const button = document.querySelector('button[type="submit"]');
-				if (button) button.click();
-			""",
-		},
-		session_id=browser_session.agent_focus.session_id,
-	)
-
-	return ActionResult(extracted_content=solution)
-
-
-async def main():
-	task = 'Go to https://www.amazon.com/errors/validateCaptcha and solve the captcha using the solve_amazon_captcha tool'
-
-	model = ChatOpenAI(model='gpt-4.1')
-	browser_session = BrowserSession(browser_profile=browser_profile)
-	await browser_session.start()
-	agent = Agent(task=task, llm=model, controller=controller, browser_session=browser_session)
-
-	await agent.run()
-	await browser_session.kill()
-
-	input('Press Enter to close...')
-
-
-if __name__ == '__main__':
-	asyncio.run(main())
--- a/examples/use-cases/extract_pdf_content.py
+++ b/examples/use-cases/extract_pdf_content.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env -S uv run --script
+# /// script
+# requires-python = ">=3.11"
+# dependencies = ["browser-use", "mistralai"]
+# ///
+
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+
+from dotenv import load_dotenv
+
+load_dotenv()
+
+import asyncio
+import logging
+
+from browser_use import Agent, ChatOpenAI
+
+logger = logging.getLogger(__name__)
+
+
+async def main():
+	agent = Agent(
+		task="""
+        Objective: Navigate to the following URL whats on page 3?
+
+        URL: https://docs.house.gov/meetings/GO/GO00/20220929/115171/HHRG-117-GO00-20220929-SD010.pdf
+        """,
+		llm=ChatOpenAI(model='gpt-4.1-mini'),
+	)
+	result = await agent.run()
+	logger.info(result)
+
+
+if __name__ == '__main__':
+	asyncio.run(main())