Merge commit 'ad68c577c27cafb22270b811fbe9a0b9f4fcd521' into fix-disable-security-bug

This commit is contained in:
Magnus Müller
2025-09-06 15:19:59 -07:00
83 changed files with 2359 additions and 2144 deletions

View File

@@ -33,6 +33,12 @@ ANONYMIZED_TELEMETRY=true
# Default LLM model to use
# OPENAI_API_KEY=your_openai_api_key_here
# ANTHROPIC_API_KEY=your_anthropic_api_key_here
# AZURE_OPENAI_API_KEY=
# AZURE_OPENAI_ENDPOINT=
# GOOGLE_API_KEY=
# DEEPSEEK_API_KEY=
# GROK_API_KEY=
# NOVITA_API_KEY=
# Browser Configuration
# Path to Chrome/Chromium executable (optional)

View File

@@ -31,7 +31,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest, self-hosted]
os: [ubuntu-latest, macos-latest, windows-latest]
python-version: ["3.11", "3.13"]
env:
ANONYMIZED_TELEMETRY: 'false'

View File

@@ -21,16 +21,25 @@ on:
jobs:
find_tests:
runs-on: ubuntu-latest
timeout-minutes: 5 # Prevent hanging
outputs:
TEST_FILENAMES: ${{ steps.lsgrep.outputs.TEST_FILENAMES }}
# ["test_browser", "test_tools", "test_browser_session", "test_tab_management", ...]
steps:
- uses: actions/checkout@v4
with:
# Force fresh checkout to avoid any caching issues
fetch-depth: 1
- id: lsgrep
run: |
echo "🔍 Discovering test files at $(date)"
echo "Git commit: $(git rev-parse HEAD)"
echo "Git branch: $(git branch --show-current)"
echo ""
TEST_FILENAMES="$(ls tests/ci/test_*.py | sed 's|^tests/ci/||' | sed 's|\.py$||' | jq -R -s -c 'split("\n")[:-1]')"
echo "TEST_FILENAMES=${TEST_FILENAMES}" >> "$GITHUB_OUTPUT"
echo "$TEST_FILENAMES"
echo "📋 Test matrix: $TEST_FILENAMES"
# https://code.dblock.org/2021/09/03/generating-task-matrix-by-looping-over-repo-files-with-github-actions.html
- name: Check that at least one test file is found
run: |
@@ -42,6 +51,7 @@ jobs:
tests:
needs: find_tests
runs-on: ubuntu-latest
timeout-minutes: 10 # Prevent individual tests from hanging
env:
IN_DOCKER: 'True'
ANONYMIZED_TELEMETRY: 'false'
@@ -96,7 +106,20 @@ jobs:
restore-keys: |
${{ runner.os }}-browseruse-extensions-
- run: pytest tests/ci/${{ matrix.test_filename }}.py
- name: Check if test file exists and run it
run: |
TEST_FILE="tests/ci/${{ matrix.test_filename }}.py"
if [ -f "$TEST_FILE" ]; then
echo "✅ Running test file: $TEST_FILE"
pytest "$TEST_FILE"
else
echo "❌ Test file not found: $TEST_FILE"
echo "This file may have been renamed or removed. Current test files:"
ls -1 tests/ci/test_*.py | sed 's|tests/ci/||' | sed 's|\.py$||' | sort
echo ""
echo "Skipping this test job since the file no longer exists."
exit 0 # Exit successfully to not fail the entire workflow
fi
evaluate-tasks:
runs-on: ubuntu-latest

View File

@@ -14,9 +14,34 @@
[![Twitter Follow](https://img.shields.io/twitter/follow/Magnus?style=social)](https://x.com/intent/user?screen_name=mamagnus00)
[![Weave Badge](https://img.shields.io/endpoint?url=https%3A%2F%2Fapp.workweave.ai%2Fapi%2Frepository%2Fbadge%2Forg_T5Pvn3UBswTHIsN1dWS3voPg%2F881458615&labelColor=#EC6341)](https://app.workweave.ai/reports/repository/org_T5Pvn3UBswTHIsN1dWS3voPg/881458615)
<!-- Keep these links. Translations will automatically update with the README. -->
[Deutsch](https://www.readme-i18n.com/browser-use/browser-use?lang=de) |
[Español](https://www.readme-i18n.com/browser-use/browser-use?lang=es) |
[français](https://www.readme-i18n.com/browser-use/browser-use?lang=fr) |
[日本語](https://www.readme-i18n.com/browser-use/browser-use?lang=ja) |
[한국어](https://www.readme-i18n.com/browser-use/browser-use?lang=ko) |
[Português](https://www.readme-i18n.com/browser-use/browser-use?lang=pt) |
[Русский](https://www.readme-i18n.com/browser-use/browser-use?lang=ru) |
[中文](https://www.readme-i18n.com/browser-use/browser-use?lang=zh)
🌤️ Want to skip the setup? Use our <b>[cloud](https://cloud.browser-use.com)</b> for faster, scalable, stealth-enabled browser automation!
# Quick start
## 🎉 OSS Twitter Hackathon
We just hit **69,000 GitHub ⭐**!
To celebrate, we're launching **#nicehack69** — a Twitter-first hackathon with a **$6,900 prize pool**. Dream big and show us the future of browser-use agents that go beyond demos!
**Deadline: September 6, 2025**
**[🚀 Join the hackathon →](https://github.com/browser-use/nicehack69)**
<div align="center">
<a href="https://github.com/browser-use/nicehack69">
<img src="./static/NiceHack69.png" alt="NiceHack69 Hackathon" width="600"/>
</a>
</div>
# Quickstart
With pip (Python>=3.11):

View File

@@ -51,6 +51,7 @@ if TYPE_CHECKING:
from browser_use.browser import BrowserProfile, BrowserSession
from browser_use.browser import BrowserSession as Browser
from browser_use.dom.service import DomService
from browser_use.llm import models
from browser_use.llm.anthropic.chat import ChatAnthropic
from browser_use.llm.azure.chat import ChatAzureOpenAI
from browser_use.llm.google.chat import ChatGoogle
@@ -85,6 +86,8 @@ _LAZY_IMPORTS = {
'ChatGroq': ('browser_use.llm.groq.chat', 'ChatGroq'),
'ChatAzureOpenAI': ('browser_use.llm.azure.chat', 'ChatAzureOpenAI'),
'ChatOllama': ('browser_use.llm.ollama.chat', 'ChatOllama'),
# LLM models module
'models': ('browser_use.llm.models', None),
}
@@ -96,7 +99,11 @@ def __getattr__(name: str):
from importlib import import_module
module = import_module(module_path)
attr = getattr(module, attr_name)
if attr_name is None:
# For modules like 'models', return the module itself
attr = module
else:
attr = getattr(module, attr_name)
# Cache the imported attribute in the module's globals
globals()[name] = attr
return attr
@@ -126,4 +133,6 @@ __all__ = [
'ChatOllama',
'Tools',
'Controller',
# LLM models module
'models',
]

View File

@@ -87,6 +87,8 @@ def create_history_gif(
# Try different font options in order of preference
# ArialUni is a font that comes with Office and can render most non-alphabet characters
font_options = [
'PingFang',
'STHeiti Medium',
'Microsoft YaHei', # 微软雅黑
'SimHei', # 黑体
'SimSun', # 宋体

View File

@@ -17,6 +17,7 @@ from browser_use.browser.views import BrowserStateSummary
from browser_use.filesystem.file_system import FileSystem
from browser_use.llm.messages import (
BaseMessage,
ContentPartImageParam,
ContentPartTextParam,
SystemMessage,
)
@@ -108,6 +109,7 @@ class MessageManager:
vision_detail_level: Literal['auto', 'low', 'high'] = 'auto',
include_tool_call_examples: bool = False,
include_recent_events: bool = False,
sample_images: list[ContentPartTextParam | ContentPartImageParam] | None = None,
):
self.task = task
self.state = state
@@ -119,6 +121,7 @@ class MessageManager:
self.vision_detail_level = vision_detail_level
self.include_tool_call_examples = include_tool_call_examples
self.include_recent_events = include_recent_events
self.sample_images = sample_images
assert max_history_items is None or max_history_items > 5, 'max_history_items must be None or greater than 5'
@@ -190,10 +193,10 @@ class MessageManager:
logger.debug(f'Added extracted_content to read_state_description: {action_result.extracted_content}')
if action_result.long_term_memory:
action_results += f'Action {idx + 1}/{result_len}: {action_result.long_term_memory}\n'
action_results += f'{action_result.long_term_memory}\n'
logger.debug(f'Added long_term_memory to action_results: {action_result.long_term_memory}')
elif action_result.extracted_content and not action_result.include_extracted_content_only_once:
action_results += f'Action {idx + 1}/{result_len}: {action_result.extracted_content}\n'
action_results += f'{action_result.extracted_content}\n'
logger.debug(f'Added extracted_content to action_results: {action_result.extracted_content}')
if action_result.error:
@@ -201,13 +204,13 @@ class MessageManager:
error_text = action_result.error[:100] + '......' + action_result.error[-100:]
else:
error_text = action_result.error
action_results += f'Action {idx + 1}/{result_len}: {error_text}\n'
action_results += f'{error_text}\n'
logger.debug(f'Added error to action_results: {error_text}')
self.state.read_state_description = self.state.read_state_description.strip('\n')
if action_results:
action_results = f'Action Results:\n{action_results}'
action_results = f'Result:\n{action_results}'
action_results = action_results.strip('\n') if action_results else None
# Build the history item
@@ -306,6 +309,7 @@ class MessageManager:
screenshots=screenshots,
vision_detail_level=self.vision_detail_level,
include_recent_events=self.include_recent_events,
sample_images=self.sample_images,
).get_user_message(use_vision)
# Set the state message with caching enabled

View File

@@ -32,30 +32,28 @@ class HistoryItem(BaseModel):
def to_string(self) -> str:
"""Get string representation of the history item"""
step_str = f'step_{self.step_number}' if self.step_number is not None else 'step_unknown'
step_str = 'step' if self.step_number is not None else 'step_unknown'
if self.error:
return f"""<{step_str}>
{self.error}
</{step_str}>"""
elif self.system_message:
return f"""<sys>
{self.system_message}
</sys>"""
return '' # empty string
else:
content_parts = []
# Only include evaluation_previous_goal if it's not None/empty
if self.evaluation_previous_goal:
content_parts.append(f'Evaluation of Previous Step: {self.evaluation_previous_goal}')
content_parts.append(f'{self.evaluation_previous_goal}')
# Always include memory
if self.memory:
content_parts.append(f'Memory: {self.memory}')
content_parts.append(f'{self.memory}')
# Only include next_goal if it's not None/empty
if self.next_goal:
content_parts.append(f'Next Goal: {self.next_goal}')
content_parts.append(f'{self.next_goal}')
if self.action_results:
content_parts.append(self.action_results)

View File

@@ -93,6 +93,7 @@ class AgentMessagePrompt:
screenshots: list[str] | None = None,
vision_detail_level: Literal['auto', 'low', 'high'] = 'auto',
include_recent_events: bool = False,
sample_images: list[ContentPartTextParam | ContentPartImageParam] | None = None,
):
self.browser_state: 'BrowserStateSummary' = browser_state_summary
self.file_system: 'FileSystem | None' = file_system
@@ -108,6 +109,7 @@ class AgentMessagePrompt:
self.screenshots = screenshots or []
self.vision_detail_level = vision_detail_level
self.include_recent_events = include_recent_events
self.sample_images = sample_images or []
assert self.browser_state
@observe_debug(ignore_input=True, ignore_output=True, name='_get_browser_state_description')
@@ -132,8 +134,13 @@ class AgentMessagePrompt:
pages_below = pi.pixels_below / pi.viewport_height if pi.viewport_height > 0 else 0
total_pages = pi.page_height / pi.viewport_height if pi.viewport_height > 0 else 0
current_page_position = pi.scroll_y / max(pi.page_height - pi.viewport_height, 1)
page_info_text = f'Page info: {pi.viewport_width}x{pi.viewport_height}px viewport, {pi.page_width}x{pi.page_height}px total page size, {pages_above:.1f} pages above, {pages_below:.1f} pages below, {total_pages:.1f} total pages, at {current_page_position:.0%} of page'
page_info_text = '<page_info>'
page_info_text += f'Viewport size: {pi.viewport_width}x{pi.viewport_height}px, Total page size: {pi.page_width}x{pi.page_height}px, '
page_info_text += f'{pages_above:.1f} pages above, '
page_info_text += f'{pages_below:.1f} pages below, '
page_info_text += f'{total_pages:.1f} total pages'
page_info_text += '</page_info>\n'
# , at {current_page_position:.0%} of page
if elements_text != '':
if has_content_above:
if self.browser_state.page_info:
@@ -187,19 +194,23 @@ class AgentMessagePrompt:
Available tabs:
{tabs_text}
{page_info_text}
{recent_events_text}{pdf_message}Interactive elements from top layer of the current page inside the viewport{truncated_text}:
{recent_events_text}{pdf_message}Elements you can interact with inside the viewport{truncated_text}:
{elements_text}
"""
return browser_state
def _get_agent_state_description(self) -> str:
if self.step_info:
step_info_description = f'Step {self.step_info.step_number + 1} of {self.step_info.max_steps} max possible steps\n'
step_info_description = f'Step {self.step_info.step_number + 1}. Maximum steps: {self.step_info.max_steps}\n'
else:
step_info_description = ''
time_str = datetime.now().strftime('%Y-%m-%d %H:%M')
step_info_description += f'Current date and time: {time_str}'
time_str = datetime.now().strftime('%Y-%m-%d')
step_info_description += f'Current date: {time_str}'
_todo_contents = self.file_system.get_todo_contents() if self.file_system else ''
if not len(_todo_contents):
_todo_contents = '[Current todo.md is empty, fill it with your plan when applicable]'
@@ -240,7 +251,7 @@ Available tabs:
state_description = (
'<agent_history>\n'
+ (self.agent_history_description.strip('\n') if self.agent_history_description else '')
+ '\n</agent_history>\n'
+ '\n</agent_history>\n\n'
)
state_description += '<agent_state>\n' + self._get_agent_state_description().strip('\n') + '\n</agent_state>\n'
state_description += '<browser_state>\n' + self._get_browser_state_description().strip('\n') + '\n</browser_state>\n'
@@ -258,6 +269,9 @@ Available tabs:
# Start with text description
content_parts: list[ContentPartTextParam | ContentPartImageParam] = [ContentPartTextParam(text=state_description)]
# Add sample images
content_parts.extend(self.sample_images)
# Add screenshots with labels
for i, screenshot in enumerate(self.screenshots):
if i == len(self.screenshots) - 1:

View File

@@ -4,7 +4,6 @@ import inspect
import json
import logging
import re
import sys
import tempfile
import time
from collections.abc import Awaitable, Callable
@@ -24,14 +23,14 @@ from browser_use.agent.cloud_events import (
)
from browser_use.agent.message_manager.utils import save_conversation
from browser_use.llm.base import BaseChatModel
from browser_use.llm.messages import BaseMessage, UserMessage
from browser_use.llm.messages import BaseMessage, ContentPartImageParam, ContentPartTextParam, UserMessage
from browser_use.llm.openai.chat import ChatOpenAI
from browser_use.tokens.service import TokenCost
load_dotenv()
from bubus import EventBus
from pydantic import ValidationError
from pydantic import BaseModel, ValidationError
from uuid_extensions import uuid7str
from browser_use import Browser, BrowserProfile, BrowserSession
@@ -67,6 +66,7 @@ from browser_use.telemetry.views import AgentTelemetryEvent
from browser_use.tools.registry.views import ActionModel
from browser_use.tools.service import Tools
from browser_use.utils import (
URL_PATTERN,
_log_pretty_path,
get_browser_use_version,
get_git_info,
@@ -128,7 +128,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
def __init__(
self,
task: str,
llm: BaseChatModel = ChatOpenAI(model='gpt-4.1-mini'),
llm: BaseChatModel | None = None,
# Optional parameters
browser_profile: BrowserProfile | None = None,
browser_session: BrowserSession | None = None,
@@ -179,8 +179,28 @@ class Agent(Generic[Context, AgentStructuredOutput]):
step_timeout: int = 120,
directly_open_url: bool = True,
include_recent_events: bool = False,
sample_images: list[ContentPartTextParam | ContentPartImageParam] | None = None,
final_response_after_failure: bool = True,
_url_shortening_limit: int = 25,
**kwargs,
):
if llm is None:
default_llm_name = CONFIG.DEFAULT_LLM
if default_llm_name:
try:
from browser_use.llm.models import get_llm_by_name
llm = get_llm_by_name(default_llm_name)
except (ImportError, ValueError) as e:
# Use the logger that's already imported at the top of the module
logger.warning(
f'Failed to create default LLM "{default_llm_name}": {e}. Falling back to ChatOpenAI(model="gpt-4.1-mini")'
)
llm = ChatOpenAI(model='gpt-4.1-mini')
else:
# No default LLM specified, use the original default
llm = ChatOpenAI(model='gpt-4.1-mini')
if page_extraction_llm is None:
page_extraction_llm = llm
if available_file_paths is None:
@@ -210,6 +230,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.llm = llm
self.directly_open_url = directly_open_url
self.include_recent_events = include_recent_events
self._url_shortening_limit = _url_shortening_limit
if tools is not None:
self.tools = tools
elif controller is not None:
@@ -224,6 +245,8 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.sensitive_data = sensitive_data
self.sample_images = sample_images
self.settings = AgentSettings(
use_vision=use_vision,
vision_detail_level=vision_detail_level,
@@ -243,6 +266,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
include_tool_call_examples=include_tool_call_examples,
llm_timeout=llm_timeout,
step_timeout=step_timeout,
final_response_after_failure=final_response_after_failure,
)
# Token cost service
@@ -297,7 +321,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.logger.warning('⚠️ XAI models do not support use_vision=True yet. Setting use_vision=False for now...')
self.settings.use_vision = False
self.logger.info(f'🧠 Starting a browser-use version {self.version} with model={self.llm.model}')
logger.debug(
f'{" +vision" if self.settings.use_vision else ""}'
f' extraction_model={self.settings.page_extraction_llm.model if self.settings.page_extraction_llm else "Unknown"}'
@@ -330,6 +353,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
vision_detail_level=self.settings.vision_detail_level,
include_tool_call_examples=self.settings.include_tool_call_examples,
include_recent_events=self.include_recent_events,
sample_images=self.sample_images,
)
if self.sensitive_data:
@@ -339,23 +363,9 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# If no allowed_domains are configured, show a security warning
if not self.browser_profile.allowed_domains:
self.logger.error(
'⚠️⚠️⚠️ Agent(sensitive_data=••••••••) was provided but BrowserSession(allowed_domains=[...]) is not locked down! ⚠️⚠️⚠️\n'
'⚠️ Agent(sensitive_data=••••••••) was provided but Browser(allowed_domains=[...]) is not locked down! ⚠️\n'
' ☠️ If the agent visits a malicious website and encounters a prompt-injection attack, your sensitive_data may be exposed!\n\n'
' https://docs.browser-use.com/customize/browser-settings#restrict-urls\n'
'Waiting 10 seconds before continuing... Press [Ctrl+C] to abort.'
)
if sys.stdin.isatty():
try:
time.sleep(10)
except KeyboardInterrupt:
print(
'\n\n 🛑 Exiting now... set BrowserSession(allowed_domains=["example.com", "example.org"]) to only domains you trust to see your sensitive_data.'
)
sys.exit(0)
else:
pass # no point waiting if we're not in an interactive shell
self.logger.warning(
'‼️ Continuing with insecure settings for now... but this will become a hard error in the future!'
' \n'
)
# If we're using domain-specific credentials, validate domain patterns
@@ -426,6 +436,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self._last_known_downloads: list[str] = []
self.logger.debug('📁 Initialized download tracking for agent')
# Event-based pause control (kept out of AgentState for serialization)
self._external_pause_event = asyncio.Event()
self._external_pause_event.set()
@@ -606,8 +617,10 @@ class Agent(Generic[Context, AgentStructuredOutput]):
if await self.register_external_agent_status_raise_error_callback():
raise InterruptedError
if self.state.stopped or self.state.paused:
# self.logger.debug('Agent paused after getting state')
if self.state.stopped:
raise InterruptedError
if self.state.paused:
raise InterruptedError
@observe(name='agent.step', ignore_output=True, ignore_input=True)
@@ -615,6 +628,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
async def step(self, step_info: AgentStepInfo | None = None) -> None:
"""Execute one step of the task"""
# Initialize timing first, before any exceptions can occur
self.step_start_time = time.time()
browser_state_summary = None
@@ -682,7 +696,8 @@ class Agent(Generic[Context, AgentStructuredOutput]):
available_file_paths=self.available_file_paths, # Always pass current available_file_paths
)
await self._handle_final_step(step_info)
await self._force_done_after_last_step(step_info)
await self._force_done_after_failure()
return browser_state_summary
@observe_debug(ignore_input=True, name='get_next_action')
@@ -768,7 +783,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# Handle all other exceptions
include_trace = self.logger.isEnabledFor(logging.DEBUG)
error_msg = AgentError.format_error(error, include_trace=include_trace)
prefix = f'❌ Result failed {self.state.consecutive_failures + 1}/{self.settings.max_failures} times:\n '
prefix = f'❌ Result failed {self.state.consecutive_failures + 1}/{self.settings.max_failures + int(self.settings.final_response_after_failure)} times:\n '
self.state.consecutive_failures += 1
# Handle InterruptedError specially
@@ -833,7 +848,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# Increment step counter after step is fully completed
self.state.n_steps += 1
async def _handle_final_step(self, step_info: AgentStepInfo | None = None) -> None:
async def _force_done_after_last_step(self, step_info: AgentStepInfo | None = None) -> None:
"""Handle special processing for the last step"""
if step_info and step_info.is_last_step():
# Add last step warning if needed
@@ -845,6 +860,19 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self._message_manager._add_context_message(UserMessage(content=msg))
self.AgentOutput = self.DoneAgentOutput
async def _force_done_after_failure(self) -> None:
	"""Force done after failure"""
	# Only intervene once the failure budget is spent and the
	# final-response-after-failure feature is enabled; otherwise no-op.
	if not self.settings.final_response_after_failure:
		return
	if self.state.consecutive_failures < self.settings.max_failures:
		return
	recovery_msg = (
		f'You have failed {self.settings.max_failures} consecutive times. This is your final step to complete the task or provide what you found. '
		'Use only the "done" action now. No other actions - so here your action sequence must have length 1.'
		'\nIf the task could not be completed due to the failures, set success in "done" to false!'
		'\nInclude everything you found out for the task in the done text.'
	)
	self.logger.debug('Force done action, because we reached max_failures.')
	self._message_manager._add_context_message(UserMessage(content=recovery_msg))
	# Swap the output schema so the model can only emit a `done` action.
	self.AgentOutput = self.DoneAgentOutput
async def _get_model_output_with_retry(self, input_messages: list[BaseMessage]) -> AgentOutput:
"""Get model output with retry logic for empty actions"""
model_output = await self.get_model_output(input_messages)
@@ -965,15 +993,172 @@ class Agent(Generic[Context, AgentStructuredOutput]):
text = re.sub(STRAY_CLOSE_TAG, '', text)
return text.strip()
# region - URL replacement
def _replace_urls_in_text(self, text: str) -> tuple[str, dict[str, str]]:
	"""Shorten long URLs in a text string.

	URLs whose query/fragment portion exceeds ``self._url_shortening_limit``
	characters are truncated and suffixed with a 7-char md5 hash of the full
	query/fragment, so distinct URLs remain distinguishable after shortening.

	Returns:
		(new_text, replaced_urls) where replaced_urls maps
		{shortened_url: original_url} so the substitution can later be undone.
	"""
	# Hoisted out of the per-match closure: the original imported hashlib
	# on every single regex match.
	import hashlib

	replaced_urls: dict[str, str] = {}

	def replace_url(match: re.Match) -> str:
		"""Url can only have 1 query and 1 fragment"""
		original_url = match.group(0)

		# Find where the query/fragment starts (earliest of '?' and '#').
		query_start = original_url.find('?')
		fragment_start = original_url.find('#')
		after_path_start = len(original_url)  # Default: no query/fragment
		if query_start != -1:
			after_path_start = min(after_path_start, query_start)
		if fragment_start != -1:
			after_path_start = min(after_path_start, fragment_start)

		# Split URL into base (up to path) and after_path (query + fragment)
		base_url = original_url[:after_path_start]
		after_path = original_url[after_path_start:]

		# If after_path is within the limit, don't shorten
		if len(after_path) <= self._url_shortening_limit:
			return original_url

		# Truncate and append a short hash of the full after_path content
		truncated_after_path = after_path[: self._url_shortening_limit]
		short_hash = hashlib.md5(after_path.encode('utf-8')).hexdigest()[:7]
		shortened = f'{base_url}{truncated_after_path}...{short_hash}'

		# Only use shortened URL if it's actually shorter than the original
		if len(shortened) < len(original_url):
			replaced_urls[shortened] = original_url
			return shortened
		return original_url

	return URL_PATTERN.sub(replace_url, text), replaced_urls
def _process_messsages_and_replace_long_urls_shorter_ones(self, input_messages: list[BaseMessage]) -> dict[str, str]:
	"""Replace long URLs with shorter ones in all user/assistant messages.

	NOTE: edits ``input_messages`` in place; SystemMessages are skipped since
	we control their content anyway.

	Returns:
		dict mapping {shortened_url: original_url} for every URL replaced,
		so the substitution can later be reversed in the LLM response.
	"""
	from browser_use.llm.messages import AssistantMessage, UserMessage

	urls_replaced: dict[str, str] = {}

	# Process each message, in place
	for message in input_messages:
		# no need to process SystemMessage, we have control over that anyway
		if isinstance(message, (UserMessage, AssistantMessage)):
			if isinstance(message.content, str):
				# Simple string content
				message.content, replaced_urls = self._replace_urls_in_text(message.content)
				urls_replaced.update(replaced_urls)
			elif isinstance(message.content, list):
				# List of content parts — only text parts can contain URLs;
				# image parts are left untouched
				for part in message.content:
					if isinstance(part, ContentPartTextParam):
						part.text, replaced_urls = self._replace_urls_in_text(part.text)
						urls_replaced.update(replaced_urls)
	return urls_replaced
@staticmethod
def _recursive_process_all_strings_inside_pydantic_model(model: BaseModel, url_replacements: dict[str, str]) -> None:
	"""Recursively process all strings inside a Pydantic model, replacing shortened URLs with originals in place.

	Strings and list/tuple fields are written back via setattr (tuples are
	immutable, so a new container may be produced); nested models and dicts
	are mutated in place by the recursive helpers.
	"""
	for field_name, field_value in model.__dict__.items():
		if isinstance(field_value, str):
			# Replace shortened URLs with original URLs in string
			processed_string = Agent._replace_shortened_urls_in_string(field_value, url_replacements)
			setattr(model, field_name, processed_string)
		elif isinstance(field_value, BaseModel):
			# Recursively process nested Pydantic models
			Agent._recursive_process_all_strings_inside_pydantic_model(field_value, url_replacements)
		elif isinstance(field_value, dict):
			# Process dictionary values in place
			Agent._recursive_process_dict(field_value, url_replacements)
		elif isinstance(field_value, (list, tuple)):
			# Helper returns the (possibly new) container — reassign the field
			processed_value = Agent._recursive_process_list_or_tuple(field_value, url_replacements)
			setattr(model, field_name, processed_value)
@staticmethod
def _recursive_process_dict(dictionary: dict, url_replacements: dict[str, str]) -> None:
	"""Restore original URLs inside every value of the dict, mutating it in place."""
	for key in dictionary:
		value = dictionary[key]
		if isinstance(value, str):
			dictionary[key] = Agent._replace_shortened_urls_in_string(value, url_replacements)
		elif isinstance(value, BaseModel):
			Agent._recursive_process_all_strings_inside_pydantic_model(value, url_replacements)
		elif isinstance(value, dict):
			Agent._recursive_process_dict(value, url_replacements)
		elif isinstance(value, (list, tuple)):
			dictionary[key] = Agent._recursive_process_list_or_tuple(value, url_replacements)
@staticmethod
def _recursive_process_list_or_tuple(container: list | tuple, url_replacements: dict[str, str]) -> list | tuple:
	"""Helper method to process lists and tuples.

	Lists are mutated in place and returned; tuples (immutable) are rebuilt
	as a new tuple. Nested models/dicts are always mutated in place.
	"""
	if isinstance(container, tuple):
		# For tuples, create a new tuple with processed items
		processed_items = []
		for item in container:
			if isinstance(item, str):
				processed_items.append(Agent._replace_shortened_urls_in_string(item, url_replacements))
			elif isinstance(item, BaseModel):
				# Mutated in place, then re-appended unchanged by identity
				Agent._recursive_process_all_strings_inside_pydantic_model(item, url_replacements)
				processed_items.append(item)
			elif isinstance(item, dict):
				Agent._recursive_process_dict(item, url_replacements)
				processed_items.append(item)
			elif isinstance(item, (list, tuple)):
				processed_items.append(Agent._recursive_process_list_or_tuple(item, url_replacements))
			else:
				# Non-string leaf (int, None, ...): passed through untouched
				processed_items.append(item)
		return tuple(processed_items)
	else:
		# For lists, modify in place
		for i, item in enumerate(container):
			if isinstance(item, str):
				container[i] = Agent._replace_shortened_urls_in_string(item, url_replacements)
			elif isinstance(item, BaseModel):
				Agent._recursive_process_all_strings_inside_pydantic_model(item, url_replacements)
			elif isinstance(item, dict):
				Agent._recursive_process_dict(item, url_replacements)
			elif isinstance(item, (list, tuple)):
				container[i] = Agent._recursive_process_list_or_tuple(item, url_replacements)
		return container
@staticmethod
def _replace_shortened_urls_in_string(text: str, url_replacements: dict[str, str]) -> str:
	"""Undo URL shortening: swap every shortened URL in the text back to its original form."""
	for shortened_url, original_url in url_replacements.items():
		text = text.replace(shortened_url, original_url)
	return text
# endregion - URL replacement
@time_execution_async('--get_next_action')
@observe_debug(ignore_input=True, ignore_output=True, name='get_model_output')
async def get_model_output(self, input_messages: list[BaseMessage]) -> AgentOutput:
"""Get next action from LLM based on current state"""
urls_replaced = self._process_messsages_and_replace_long_urls_shorter_ones(input_messages)
try:
response = await self.llm.ainvoke(input_messages, output_format=self.AgentOutput)
parsed = response.completion
# Replace any shortened URLs in the LLM response back to original URLs
if urls_replaced:
self._recursive_process_all_strings_inside_pydantic_model(parsed, urls_replaced)
# cut the number of actions to max_actions_per_step if needed
if len(parsed.action) > self.settings.max_actions_per_step:
parsed.action = parsed.action[: self.settings.max_actions_per_step]
@@ -994,6 +1179,11 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.logger.debug(f'🤖 Browser-Use Library Version {self.version} ({self.source})')
def _log_first_step_startup(self) -> None:
"""Log startup message only on the first step"""
if len(self.history.history) == 0:
self.logger.info(f'🧠 Starting a browser-use version {self.version} with model={self.llm.model}')
def _log_step_context(self, browser_state_summary: BrowserStateSummary) -> None:
"""Log step context information"""
url = browser_state_summary.url if browser_state_summary else ''
@@ -1122,6 +1312,11 @@ class Agent(Generic[Context, AgentStructuredOutput]):
Returns:
Tuple[bool, bool]: (is_done, is_valid)
"""
if len(self.history.history) == 0:
# First step
self._log_first_step_startup()
await self._execute_initial_actions()
await self.step(step_info)
if self.history.is_done():
@@ -1250,17 +1445,21 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.logger.warning('⚠️ No browser focus established, may cause navigation issues')
await self._execute_initial_actions()
# Log startup message on first step (only if we haven't already done steps)
self._log_first_step_startup()
self.logger.debug(f'🔄 Starting main execution loop with max {max_steps} steps...')
for step in range(max_steps):
# Replace the polling with clean pause-wait
# Use the consolidated pause state management
if self.state.paused:
self.logger.debug(f'⏸️ Step {step}: Agent paused, waiting to resume...')
await self.wait_until_resumed()
await self._external_pause_event.wait()
signal_handler.reset()
# Check if we should stop due to too many failures
if self.state.consecutive_failures >= self.settings.max_failures:
# Check if we should stop due to too many failures, if final_response_after_failure is True, we try one last time
if (self.state.consecutive_failures) >= self.settings.max_failures + int(
self.settings.final_response_after_failure
):
self.logger.error(f'❌ Stopping due to {self.settings.max_failures} consecutive failures')
agent_run_error = f'Stopped due to {self.settings.max_failures} consecutive failures'
break
@@ -1271,12 +1470,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
agent_run_error = 'Agent stopped programmatically'
break
while self.state.paused:
await asyncio.sleep(0.5) # Small delay to prevent CPU spinning
if self.state.stopped: # Allow stopping while paused
agent_run_error = 'Agent stopped programmatically while paused'
break
if on_step_start is not None:
await on_step_start(self)
@@ -1476,7 +1669,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
if orig_target_hash != new_target_hash:
# Get names of remaining actions that won't be executed
remaining_actions_str = get_remaining_actions_str(actions, i)
msg = f'Page changed after action {i} / {total_actions}: actions {remaining_actions_str} were not executed'
msg = f'Page changed after action: actions {remaining_actions_str} are not yet executed'
logger.info(msg)
results.append(
ActionResult(
@@ -1716,39 +1909,28 @@ class Agent(Generic[Context, AgentStructuredOutput]):
file_path = 'AgentHistory.json'
self.history.save_to_file(file_path)
async def wait_until_resumed(self):
	"""Block until the external pause event is set.

	The event is set by resume() and also by stop(), so a coroutine waiting
	here wakes up either to continue execution or to observe the stopped flag.
	"""
	await self._external_pause_event.wait()
def pause(self) -> None:
	"""Pause the agent before the next step.

	Flips the paused flag and clears the external pause event so the main
	run loop blocks in wait_until_resumed() before executing another step.
	The browser is intentionally left open so the session can be resumed.
	"""
	# NOTE: merge residue previously printed two banners here (an older
	# Ctrl+C-specific one and this generic one); keep only the current message.
	print('\n\n⏸️ Paused the agent and left the browser open.\n\tPress [Enter] to resume or [Ctrl+C] again to quit.')
	self.state.paused = True
	self._external_pause_event.clear()
	# The signal handler owns the asyncio pause logic; nothing more to do here.
def resume(self) -> None:
	"""Resume a paused agent.

	Clears the paused flag and sets the external pause event so the main
	run loop, blocked in wait_until_resumed(), continues where it left off.
	"""
	# TODO: Locally the browser got closed
	print('----------------------------------------------------------------------')
	# NOTE: merge residue previously printed two banners here (an older
	# "Got Enter" one and this generic one); keep only the current message.
	print('▶️ Resuming agent execution where it left off...\n')
	self.state.paused = False
	self._external_pause_event.set()
	# The signal handler already reset its own flags via reset() inside run().
def stop(self) -> None:
	"""Request that the agent stop before its next step.

	Only flips the stopped flag and wakes anything blocked on the pause
	event; the run loop performs the actual shutdown when it observes the flag.
	"""
	self.logger.info('⏹️ Agent stopping')
	# Set the flag BEFORE signalling the event so any waiter that wakes up
	# immediately sees state.stopped == True.
	self.state.stopped = True
	# Signal pause event to unblock any waiting code (e.g. wait_until_resumed)
	# so it can re-check the stopped state instead of sleeping forever.
	self._external_pause_event.set()
def _convert_initial_actions(self, actions: list[dict[str, dict[str, Any]]]) -> list[ActionModel]:

View File

@@ -61,7 +61,7 @@ Examples:
Note that:
- Only elements with numeric indexes in [] are interactive
- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
- Elements tagged with `*[` are the new clickable elements that appeared on the website since the last step - if url has not changed.
- Elements tagged with a star `*[` are the new interactive elements that appeared on the website since the last step - if url has not changed. Your previous actions caused that change. Think if you need to interact with them, e.g. after input_text you might need to select the right option from the list.
- Pure text elements without [] are not interactive.
</browser_state>

View File

@@ -59,7 +59,7 @@ Examples:
Note that:
- Only elements with numeric indexes in [] are interactive
- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
- Elements tagged with `*[` are the new clickable elements that appeared on the website since the last step - if url has not changed.
- Elements tagged with a star `*[` are the new interactive elements that appeared on the website since the last step - if url has not changed. Your previous actions caused that change. Think if you need to interact with them, e.g. after input_text you might need to select the right option from the list.
- Pure text elements without [] are not interactive.
</browser_state>

View File

@@ -61,7 +61,7 @@ Examples:
Note that:
- Only elements with numeric indexes in [] are interactive
- (stacked) indentation (with \t) is important and means that the element is a (html) child of the element above (with a lower index)
- Elements tagged with `*[` are the new clickable elements that appeared on the website since the last step - if url has not changed.
- Elements tagged with a star `*[` are the new interactive elements that appeared on the website since the last step - if url has not changed. Your previous actions caused that change. Think if you need to interact with them, e.g. after input_text you might need to select the right option from the list.
- Pure text elements without [] are not interactive.
</browser_state>

View File

@@ -39,7 +39,7 @@ class AgentSettings(BaseModel):
override_system_message: str | None = None
extend_system_message: str | None = None
include_attributes: list[str] | None = DEFAULT_INCLUDE_ATTRIBUTES
max_actions_per_step: int = 10
max_actions_per_step: int = 4
use_thinking: bool = True
flash_mode: bool = False # If enabled, disables evaluation_previous_goal and next_goal, and sets use_thinking = False
max_history_items: int | None = None
@@ -49,17 +49,22 @@ class AgentSettings(BaseModel):
include_tool_call_examples: bool = False
llm_timeout: int = 60 # Timeout in seconds for LLM calls
step_timeout: int = 180 # Timeout in seconds for each step
final_response_after_failure: bool = True # If True, attempt one final recovery call after max_failures
class AgentState(BaseModel):
"""Holds all state information for an Agent"""
model_config = ConfigDict(arbitrary_types_allowed=True)
agent_id: str = Field(default_factory=uuid7str)
n_steps: int = 1
consecutive_failures: int = 0
last_result: list[ActionResult] | None = None
last_plan: str | None = None
last_model_output: AgentOutput | None = None
# Pause/resume state (kept serialisable for checkpointing)
paused: bool = False
stopped: bool = False
session_initialized: bool = False # Track if session events have been dispatched
@@ -68,9 +73,6 @@ class AgentState(BaseModel):
message_manager_state: MessageManagerState = Field(default_factory=MessageManagerState)
file_system_state: FileSystemState | None = None
# class Config:
# arbitrary_types_allowed = True
@dataclass
class AgentStepInfo:

View File

@@ -1,6 +1,7 @@
"""Event definitions for browser communication."""
import inspect
import os
from typing import Any, Literal
from bubus import BaseEvent
@@ -11,6 +12,37 @@ from pydantic import BaseModel, Field, field_validator
from browser_use.browser.views import BrowserStateSummary
from browser_use.dom.views import EnhancedDOMTreeNode
def _get_timeout(env_var: str, default: float) -> float | None:
"""
Safely parse environment variable timeout values with robust error handling.
Args:
env_var: Environment variable name (e.g. 'TIMEOUT_NavigateToUrlEvent')
default: Default timeout value as float (e.g. 15.0)
Returns:
Parsed float value or the default if parsing fails
Raises:
ValueError: Only if both env_var and default are invalid (should not happen with valid defaults)
"""
# Try environment variable first
env_value = os.getenv(env_var)
if env_value:
try:
parsed = float(env_value)
if parsed < 0:
print(f'Warning: {env_var}={env_value} is negative, using default {default}')
return default
return parsed
except (ValueError, TypeError):
print(f'Warning: {env_var}={env_value} is not a valid number, using default {default}')
# Fall back to default
return default
# ============================================================================
# Agent/Tools -> BrowserSession Events (High-level browser actions)
# ============================================================================
@@ -88,7 +120,7 @@ class NavigateToUrlEvent(BaseEvent[None]):
# existing_tab: PageHandle | None = None # TODO
# time limits enforced by bubus, not exposed to LLM:
event_timeout: float | None = 15.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_NavigateToUrlEvent', 15.0) # seconds
class ClickElementEvent(ElementSelectedEvent[dict[str, Any] | None]):
@@ -103,7 +135,7 @@ class ClickElementEvent(ElementSelectedEvent[dict[str, Any] | None]):
# click_count: int = 1 # TODO
# expect_download: bool = False # moved to downloads_watchdog.py
event_timeout: float | None = 15.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_ClickElementEvent', 15.0) # seconds
class TypeTextEvent(ElementSelectedEvent[dict | None]):
@@ -113,7 +145,7 @@ class TypeTextEvent(ElementSelectedEvent[dict | None]):
text: str
clear_existing: bool = True
event_timeout: float | None = 15.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_TypeTextEvent', 15.0) # seconds
class ScrollEvent(ElementSelectedEvent[None]):
@@ -123,7 +155,7 @@ class ScrollEvent(ElementSelectedEvent[None]):
amount: int # pixels
node: 'EnhancedDOMTreeNode | None' = None # None means scroll page
event_timeout: float | None = 8.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_ScrollEvent', 8.0) # seconds
class SwitchTabEvent(BaseEvent[TargetID]):
@@ -131,7 +163,7 @@ class SwitchTabEvent(BaseEvent[TargetID]):
target_id: TargetID | None = Field(default=None, description='None means switch to the most recently opened tab')
event_timeout: float | None = 10.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_SwitchTabEvent', 10.0) # seconds
class CloseTabEvent(BaseEvent[None]):
@@ -139,7 +171,7 @@ class CloseTabEvent(BaseEvent[None]):
target_id: TargetID
event_timeout: float | None = 10.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_CloseTabEvent', 10.0) # seconds
class ScreenshotEvent(BaseEvent[str]):
@@ -148,7 +180,7 @@ class ScreenshotEvent(BaseEvent[str]):
full_page: bool = False
clip: dict[str, float] | None = None # {x, y, width, height}
event_timeout: float | None = 8.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_ScreenshotEvent', 8.0) # seconds
class BrowserStateRequestEvent(BaseEvent[BrowserStateSummary]):
@@ -159,7 +191,7 @@ class BrowserStateRequestEvent(BaseEvent[BrowserStateSummary]):
cache_clickable_elements_hashes: bool = True
include_recent_events: bool = False
event_timeout: float | None = 30.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_BrowserStateRequestEvent', 30.0) # seconds
# class WaitForConditionEvent(BaseEvent):
@@ -174,19 +206,19 @@ class BrowserStateRequestEvent(BaseEvent[BrowserStateSummary]):
class GoBackEvent(BaseEvent[None]):
"""Navigate back in browser history."""
event_timeout: float | None = 15.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_GoBackEvent', 15.0) # seconds
class GoForwardEvent(BaseEvent[None]):
"""Navigate forward in browser history."""
event_timeout: float | None = 15.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_GoForwardEvent', 15.0) # seconds
class RefreshEvent(BaseEvent[None]):
"""Refresh/reload the current page."""
event_timeout: float | None = 15.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_RefreshEvent', 15.0) # seconds
class WaitEvent(BaseEvent[None]):
@@ -195,7 +227,7 @@ class WaitEvent(BaseEvent[None]):
seconds: float = 3.0
max_seconds: float = 10.0 # Safety cap
event_timeout: float | None = 60.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_WaitEvent', 60.0) # seconds
class SendKeysEvent(BaseEvent[None]):
@@ -203,7 +235,7 @@ class SendKeysEvent(BaseEvent[None]):
keys: str # e.g., "ctrl+a", "cmd+c", "Enter"
event_timeout: float | None = 15.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_SendKeysEvent', 15.0) # seconds
class UploadFileEvent(ElementSelectedEvent[None]):
@@ -212,7 +244,7 @@ class UploadFileEvent(ElementSelectedEvent[None]):
node: 'EnhancedDOMTreeNode'
file_path: str
event_timeout: float | None = 30.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_UploadFileEvent', 30.0) # seconds
class GetDropdownOptionsEvent(ElementSelectedEvent[dict[str, str]]):
@@ -222,9 +254,10 @@ class GetDropdownOptionsEvent(ElementSelectedEvent[dict[str, str]]):
node: 'EnhancedDOMTreeNode'
event_timeout: float | None = (
15.0 # some dropdowns lazy-load the list of options on first interaction, so we need to wait for them to load (e.g. table filter lists can have thousands of options)
)
event_timeout: float | None = _get_timeout(
'TIMEOUT_GetDropdownOptionsEvent',
15.0,
) # some dropdowns lazy-load the list of options on first interaction, so we need to wait for them to load (e.g. table filter lists can have thousands of options)
class SelectDropdownOptionEvent(ElementSelectedEvent[dict[str, str]]):
@@ -235,7 +268,7 @@ class SelectDropdownOptionEvent(ElementSelectedEvent[dict[str, str]]):
node: 'EnhancedDOMTreeNode'
text: str # The option text to select
event_timeout: float | None = 8.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_SelectDropdownOptionEvent', 8.0) # seconds
class ScrollToTextEvent(BaseEvent[None]):
@@ -244,7 +277,7 @@ class ScrollToTextEvent(BaseEvent[None]):
text: str
direction: Literal['up', 'down'] = 'down'
event_timeout: float | None = 15.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_ScrollToTextEvent', 15.0) # seconds
# ============================================================================
@@ -256,7 +289,7 @@ class BrowserStartEvent(BaseEvent):
cdp_url: str | None = None
launch_options: dict[str, Any] = Field(default_factory=dict)
event_timeout: float | None = 30.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_BrowserStartEvent', 30.0) # seconds
class BrowserStopEvent(BaseEvent):
@@ -264,7 +297,7 @@ class BrowserStopEvent(BaseEvent):
force: bool = False
event_timeout: float | None = 45.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_BrowserStopEvent', 45.0) # seconds
class BrowserLaunchResult(BaseModel):
@@ -279,13 +312,13 @@ class BrowserLaunchEvent(BaseEvent[BrowserLaunchResult]):
# TODO: add executable_path, proxy settings, preferences, extra launch args, etc.
event_timeout: float | None = 30.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_BrowserLaunchEvent', 30.0) # seconds
class BrowserKillEvent(BaseEvent):
"""Kill local browser subprocess."""
event_timeout: float | None = 30.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_BrowserKillEvent', 30.0) # seconds
# TODO: replace all Runtime.evaluate() calls with this event
@@ -338,7 +371,7 @@ class BrowserConnectedEvent(BaseEvent):
cdp_url: str
event_timeout: float | None = 30.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_BrowserConnectedEvent', 30.0) # seconds
class BrowserStoppedEvent(BaseEvent):
@@ -346,7 +379,7 @@ class BrowserStoppedEvent(BaseEvent):
reason: str | None = None
event_timeout: float | None = 30.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_BrowserStoppedEvent', 30.0) # seconds
class TabCreatedEvent(BaseEvent):
@@ -355,7 +388,7 @@ class TabCreatedEvent(BaseEvent):
target_id: TargetID
url: str
event_timeout: float | None = 30.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_TabCreatedEvent', 30.0) # seconds
class TabClosedEvent(BaseEvent):
@@ -367,7 +400,7 @@ class TabClosedEvent(BaseEvent):
# new_focus_target_id: int | None = None
# new_focus_url: str | None = None
event_timeout: float | None = 10.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_TabClosedEvent', 10.0) # seconds
# TODO: emit this when DOM changes significantly, inner frame navigates, form submits, history.pushState(), etc.
@@ -384,7 +417,7 @@ class AgentFocusChangedEvent(BaseEvent):
target_id: TargetID
url: str
event_timeout: float | None = 10.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_AgentFocusChangedEvent', 10.0) # seconds
class TargetCrashedEvent(BaseEvent):
@@ -393,7 +426,7 @@ class TargetCrashedEvent(BaseEvent):
target_id: TargetID
error: str
event_timeout: float | None = 10.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_TargetCrashedEvent', 10.0) # seconds
class NavigationStartedEvent(BaseEvent):
@@ -402,7 +435,7 @@ class NavigationStartedEvent(BaseEvent):
target_id: TargetID
url: str
event_timeout: float | None = 30.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_NavigationStartedEvent', 30.0) # seconds
class NavigationCompleteEvent(BaseEvent):
@@ -414,7 +447,7 @@ class NavigationCompleteEvent(BaseEvent):
error_message: str | None = None # Error/timeout message if navigation had issues
loading_status: str | None = None # Detailed loading status (e.g., network timeout info)
event_timeout: float | None = 30.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_NavigationCompleteEvent', 30.0) # seconds
# ============================================================================
@@ -429,7 +462,7 @@ class BrowserErrorEvent(BaseEvent):
message: str
details: dict[str, Any] = Field(default_factory=dict)
event_timeout: float | None = 30.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_BrowserErrorEvent', 30.0) # seconds
# ============================================================================
@@ -442,7 +475,7 @@ class SaveStorageStateEvent(BaseEvent):
path: str | None = None # Optional path, uses profile default if not provided
event_timeout: float | None = 45.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_SaveStorageStateEvent', 45.0) # seconds
class StorageStateSavedEvent(BaseEvent):
@@ -452,7 +485,7 @@ class StorageStateSavedEvent(BaseEvent):
cookies_count: int
origins_count: int
event_timeout: float | None = 30.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_StorageStateSavedEvent', 30.0) # seconds
class LoadStorageStateEvent(BaseEvent):
@@ -460,7 +493,7 @@ class LoadStorageStateEvent(BaseEvent):
path: str | None = None # Optional path, uses profile default if not provided
event_timeout: float | None = 45.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_LoadStorageStateEvent', 45.0) # seconds
# TODO: refactor this to:
@@ -474,7 +507,7 @@ class StorageStateLoadedEvent(BaseEvent):
cookies_count: int
origins_count: int
event_timeout: float | None = 30.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_StorageStateLoadedEvent', 30.0) # seconds
# ============================================================================
@@ -494,7 +527,7 @@ class FileDownloadedEvent(BaseEvent):
from_cache: bool = False
auto_download: bool = False # Whether this was an automatic download (e.g., PDF auto-download)
event_timeout: float | None = 30.0 # seconds
event_timeout: float | None = _get_timeout('TIMEOUT_FileDownloadedEvent', 30.0) # seconds
class AboutBlankDVDScreensaverShownEvent(BaseEvent):
@@ -510,7 +543,7 @@ class DialogOpenedEvent(BaseEvent):
dialog_type: str # 'alert', 'confirm', 'prompt', or 'beforeunload'
message: str
url: str
frame_id: str
frame_id: str | None = None # Can be None when frameId is not provided by CDP
# target_id: TargetID # TODO: add this to avoid needing target_id_from_frame() later

View File

@@ -10,7 +10,6 @@ from urllib.parse import urlparse
from pydantic import AfterValidator, AliasChoices, BaseModel, ConfigDict, Field, field_validator, model_validator
from browser_use.config import CONFIG
from browser_use.observability import observe_debug
from browser_use.utils import _log_pretty_path, logger
CHROME_DEBUG_PORT = 9242 # use a non-default port to avoid conflicts with other tools / devs using 9222
@@ -616,6 +615,18 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
# save_har_path: alias of record_har_path
# trace_path: alias of traces_dir
# these shadow the old playwright args on BrowserContextArgs, but it's ok
# because we handle them ourselves in a watchdog and we no longer use playwright, so they should live in the scope for our own config in BrowserProfile long-term
record_video_dir: Path | None = Field(
default=None,
description='Directory to save video recordings. If set, a video of the session will be recorded.',
validation_alias=AliasChoices('save_recording_path', 'record_video_dir'),
)
record_video_size: ViewportSize | None = Field(
default=None, description='Video frame size. If not set, it will use the viewport size.'
)
record_video_framerate: int = Field(default=30, description='The framerate to use for the video recording.')
# TODO: finish implementing extension support in extensions.py
# extension_ids_to_preinstall: list[str] = Field(
# default_factory=list, description='List of Chrome extension IDs to preinstall.'
@@ -747,6 +758,10 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
if proxy_bypass:
pre_conversion_args.append(f'--proxy-bypass-list={proxy_bypass}')
# User agent flag
if self.user_agent:
pre_conversion_args.append(f'--user-agent={self.user_agent}')
# Special handling for --disable-features to merge values instead of overwriting
# This prevents disable_security=True from breaking extensions by ensuring
# both default features (including extension-related) and security features are preserved
@@ -776,6 +791,7 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
# convert to dict and back to dedupe and merge other duplicate args
final_args_list = BrowserLaunchArgs.args_as_list(BrowserLaunchArgs.args_as_dict(non_disable_features_args))
return final_args_list
def _get_extension_args(self) -> list[str]:
@@ -1016,7 +1032,6 @@ async function initialize(checkInitialized, magic) {{
os.unlink(temp_zip.name)
@observe_debug(ignore_input=True, ignore_output=True, name='detect_display_configuration')
def detect_display_configuration(self) -> None:
"""
Detect the system display size and initialize the display-related config defaults:
@@ -1031,36 +1046,43 @@ async function initialize(checkInitialized, magic) {{
if self.headless is None:
self.headless = not has_screen_available
# set up window size and position if headful
# Determine viewport behavior based on mode and user preferences
user_provided_viewport = self.viewport is not None
if self.headless:
# headless mode: no window available, use viewport instead to constrain content size
# Headless mode: always use viewport for content size control
self.viewport = self.viewport or self.window_size or self.screen
self.window_position = None # no windows to position in headless mode
self.window_position = None
self.window_size = None
self.no_viewport = False # viewport is always enabled in headless mode
self.no_viewport = False
else:
# headful mode: use window, disable viewport by default, content fits to size of window
# Headful mode: respect user's viewport preference
self.window_size = self.window_size or self.screen
self.no_viewport = True if self.no_viewport is None else self.no_viewport
self.viewport = None if self.no_viewport else self.viewport
# automatically setup viewport if any config requires it
use_viewport = self.headless or self.viewport or self.device_scale_factor
self.no_viewport = not use_viewport if self.no_viewport is None else self.no_viewport
use_viewport = not self.no_viewport
if user_provided_viewport:
# User explicitly set viewport - enable viewport mode
self.no_viewport = False
else:
# Default headful: content fits to window (no viewport)
self.no_viewport = True if self.no_viewport is None else self.no_viewport
if use_viewport:
# if we are using viewport, make device_scale_factor and screen are set to real values to avoid easy fingerprinting
# Handle special requirements (device_scale_factor forces viewport mode)
if self.device_scale_factor and self.no_viewport is None:
self.no_viewport = False
# Finalize configuration
if self.no_viewport:
# No viewport mode: content adapts to window
self.viewport = None
self.device_scale_factor = None
self.screen = None
assert self.viewport is None
assert self.no_viewport is True
else:
# Viewport mode: ensure viewport is set
self.viewport = self.viewport or self.screen
self.device_scale_factor = self.device_scale_factor or 1.0
assert self.viewport is not None
assert self.no_viewport is False
else:
# device_scale_factor and screen are not supported non-viewport mode, the system monitor determines these
self.viewport = None
self.device_scale_factor = None # only supported in viewport mode
self.screen = None # only supported in viewport mode
assert self.viewport is None
assert self.no_viewport is True
assert not (self.headless and self.no_viewport), 'headless=True and no_viewport=True cannot both be set at the same time'

View File

@@ -18,6 +18,57 @@ from browser_use.utils import time_execution_async
logger = logging.getLogger(__name__)
# Font cache to prevent repeated font loading and reduce memory usage
_FONT_CACHE: dict[tuple[str, int], ImageFont.FreeTypeFont | None] = {}
# Cross-platform font paths
_FONT_PATHS = [
'/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', # Linux (Debian/Ubuntu)
'/usr/share/fonts/TTF/DejaVuSans-Bold.ttf', # Linux (Arch/Fedora)
'/System/Library/Fonts/Arial.ttf', # macOS
'C:\\Windows\\Fonts\\arial.ttf', # Windows
'arial.ttf', # Windows (system path)
'Arial Bold.ttf', # macOS alternative
'/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf', # Linux alternative
]
def get_cross_platform_font(font_size: int) -> ImageFont.FreeTypeFont | None:
	"""Load a system font at *font_size*, caching the result per size.

	Tries each candidate path in _FONT_PATHS until one loads. The outcome —
	including a failed lookup, stored as None — is cached so repeated calls
	never re-scan the filesystem.

	Args:
		font_size: Point size of the font to load.

	Returns:
		A loaded FreeType font, or None when no candidate path is usable.
	"""
	key = ('system_font', font_size)
	try:
		# Fast path: size already resolved (possibly to None) on a prior call.
		return _FONT_CACHE[key]
	except KeyError:
		pass

	loaded = None
	for candidate in _FONT_PATHS:
		try:
			loaded = ImageFont.truetype(candidate, font_size)
		except OSError:
			continue
		break

	# Cache even a miss (None) to avoid repeated filesystem probing.
	_FONT_CACHE[key] = loaded
	return loaded
def cleanup_font_cache() -> None:
	"""Drop every cached font so long-running applications can reclaim memory."""
	# clear() mutates the shared dict in place, so no `global` rebinding is needed.
	_FONT_CACHE.clear()
# Color scheme for different element types
ELEMENT_COLORS = {
'button': '#FF6B6B', # Red for buttons
@@ -102,18 +153,10 @@ def draw_enhanced_bounding_box_with_text(
css_width = img_width # / device_pixel_ratio
# Much smaller scaling - 1% of CSS viewport width, max 16px to prevent huge highlights
base_font_size = max(10, min(20, int(css_width * 0.01)))
big_font = None
try:
big_font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', base_font_size)
except OSError:
try:
big_font = ImageFont.truetype('arial.ttf', base_font_size)
except OSError:
# Try system fonts on different platforms
try:
big_font = ImageFont.truetype('Arial Bold.ttf', base_font_size)
except OSError:
big_font = font # Fallback to original font
# Use shared font loading function with caching
big_font = get_cross_platform_font(base_font_size)
if big_font is None:
big_font = font # Fallback to original font if no system fonts found
# Get text size with bigger font
if big_font:
@@ -391,15 +434,9 @@ async def create_highlighted_screenshot(
# Create drawing context
draw = ImageDraw.Draw(image)
# Try to load a font, fall back to default if not available
font = None
try:
font = ImageFont.truetype('/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', 12)
except OSError:
try:
font = ImageFont.truetype('arial.ttf', 12)
except OSError:
font = None # Use default font
# Load font using shared function with caching
font = get_cross_platform_font(12)
# If no system fonts found, font remains None and will use default font
# Process elements sequentially to avoid ImageDraw thread safety issues
# PIL ImageDraw is not thread-safe, so we process elements one by one
@@ -408,16 +445,24 @@ async def create_highlighted_screenshot(
# Convert back to base64
output_buffer = io.BytesIO()
image.save(output_buffer, format='PNG')
output_buffer.seek(0)
try:
image.save(output_buffer, format='PNG')
output_buffer.seek(0)
highlighted_b64 = base64.b64encode(output_buffer.getvalue()).decode('utf-8')
highlighted_b64 = base64.b64encode(output_buffer.getvalue()).decode('utf-8')
logger.debug(f'Successfully created highlighted screenshot with {len(selector_map)} elements')
return highlighted_b64
logger.debug(f'Successfully created highlighted screenshot with {len(selector_map)} elements')
return highlighted_b64
finally:
# Explicit cleanup to prevent memory leaks
output_buffer.close()
if 'image' in locals():
image.close()
except Exception as e:
logger.error(f'Failed to create highlighted screenshot: {e}')
# Clean up on error as well
if 'image' in locals():
image.close()
# Return original screenshot on error
return screenshot_b64
@@ -463,6 +508,7 @@ async def create_highlighted_screenshot_async(
screenshot_b64: Base64 encoded screenshot
selector_map: Map of interactive elements
cdp_session: CDP session for getting viewport info
filter_highlight_ids: Whether to filter element IDs based on meaningful text
Returns:
Base64 encoded highlighted screenshot
@@ -496,3 +542,7 @@ async def create_highlighted_screenshot_async(
await asyncio.to_thread(_write_screenshot)
return final_screenshot
# Export the cleanup function for external use in long-running applications
__all__ = ['create_highlighted_screenshot', 'create_highlighted_screenshot_async', 'cleanup_font_cache']

View File

@@ -44,9 +44,6 @@ from browser_use.utils import _log_pretty_url, is_new_tab_page
DEFAULT_BROWSER_PROFILE = BrowserProfile()
MAX_SCREENSHOT_HEIGHT = 2000
MAX_SCREENSHOT_WIDTH = 1920
_LOGGED_UNIQUE_SESSION_IDS = set() # track unique session IDs that have been logged to make sure we always assign a unique enough id to new sessions and avoid ambiguity in logs
red = '\033[91m'
reset = '\033[0m'
@@ -247,6 +244,8 @@ class BrowserSession(BaseModel):
record_har_mode: str | None = None,
record_har_path: str | Path | None = None,
record_video_dir: str | Path | None = None,
record_video_framerate: int | None = None,
record_video_size: dict | None = None,
# From BrowserLaunchPersistentContextArgs
user_data_dir: str | Path | None = None,
# From BrowserNewContextArgs
@@ -338,6 +337,7 @@ class BrowserSession(BaseModel):
_dom_watchdog: Any | None = PrivateAttr(default=None)
_screenshot_watchdog: Any | None = PrivateAttr(default=None)
_permissions_watchdog: Any | None = PrivateAttr(default=None)
_recording_watchdog: Any | None = PrivateAttr(default=None)
_logger: Any = PrivateAttr(default=None)
@@ -404,6 +404,7 @@ class BrowserSession(BaseModel):
self._dom_watchdog = None
self._screenshot_watchdog = None
self._permissions_watchdog = None
self._recording_watchdog = None
def model_post_init(self, __context) -> None:
"""Register event handlers after model initialization."""
@@ -425,6 +426,7 @@ class BrowserSession(BaseModel):
BaseWatchdog.attach_handler_to_session(self, BrowserStopEvent, self.on_BrowserStopEvent)
BaseWatchdog.attach_handler_to_session(self, NavigateToUrlEvent, self.on_NavigateToUrlEvent)
BaseWatchdog.attach_handler_to_session(self, SwitchTabEvent, self.on_SwitchTabEvent)
BaseWatchdog.attach_handler_to_session(self, TabCreatedEvent, self.on_TabCreatedEvent)
BaseWatchdog.attach_handler_to_session(self, TabClosedEvent, self.on_TabClosedEvent)
BaseWatchdog.attach_handler_to_session(self, AgentFocusChangedEvent, self.on_AgentFocusChangedEvent)
BaseWatchdog.attach_handler_to_session(self, FileDownloadedEvent, self.on_FileDownloadedEvent)
@@ -707,6 +709,22 @@ class BrowserSession(BaseModel):
await self.event_bus.dispatch(TabClosedEvent(target_id=event.target_id))
await cdp_session.cdp_client.send.Target.closeTarget(params={'targetId': event.target_id})
async def on_TabCreatedEvent(self, event: TabCreatedEvent) -> None:
	"""Handle tab creation - apply viewport settings to new tab."""
	profile = self.browser_profile
	if not profile.viewport or profile.no_viewport:
		# No viewport configured (or viewport mode disabled): nothing to apply.
		return
	try:
		width = profile.viewport.width
		height = profile.viewport.height
		scale = profile.device_scale_factor or 1.0
		# Use the helper method with the new tab's target_id
		await self._cdp_set_viewport(width, height, scale, target_id=event.target_id)
		self.logger.debug(f'Applied viewport {width}x{height} to tab {event.target_id[-8:]}')
	except Exception as e:
		self.logger.warning(f'Failed to set viewport for new tab {event.target_id[-8:]}: {e}')
async def on_TabClosedEvent(self, event: TabClosedEvent) -> None:
"""Handle tab closure - update focus if needed."""
if not self.agent_focus:
@@ -955,9 +973,10 @@ class BrowserSession(BaseModel):
from browser_use.browser.watchdogs.local_browser_watchdog import LocalBrowserWatchdog
from browser_use.browser.watchdogs.permissions_watchdog import PermissionsWatchdog
from browser_use.browser.watchdogs.popups_watchdog import PopupsWatchdog
from browser_use.browser.watchdogs.recording_watchdog import RecordingWatchdog
from browser_use.browser.watchdogs.screenshot_watchdog import ScreenshotWatchdog
from browser_use.browser.watchdogs.security_watchdog import SecurityWatchdog
# from browser_use.browser.storage_state_watchdog import StorageStateWatchdog
from browser_use.browser.watchdogs.storage_state_watchdog import StorageStateWatchdog
# Initialize CrashWatchdog
# CrashWatchdog.model_rebuild()
@@ -978,14 +997,27 @@ class BrowserSession(BaseModel):
if self.browser_profile.auto_download_pdfs:
self.logger.debug('📄 PDF auto-download enabled for this session')
# # Initialize StorageStateWatchdog
# StorageStateWatchdog.model_rebuild()
# self._storage_state_watchdog = StorageStateWatchdog(event_bus=self.event_bus, browser_session=self)
# # self.event_bus.on(BrowserConnectedEvent, self._storage_state_watchdog.on_BrowserConnectedEvent)
# # self.event_bus.on(BrowserStopEvent, self._storage_state_watchdog.on_BrowserStopEvent)
# # self.event_bus.on(SaveStorageStateEvent, self._storage_state_watchdog.on_SaveStorageStateEvent)
# # self.event_bus.on(LoadStorageStateEvent, self._storage_state_watchdog.on_LoadStorageStateEvent)
# self._storage_state_watchdog.attach_to_session()
# Initialize StorageStateWatchdog conditionally
# Enable when user provides either storage_state or user_data_dir (indicating they want persistence)
should_enable_storage_state = (
self.browser_profile.storage_state is not None or self.browser_profile.user_data_dir is not None
)
if should_enable_storage_state:
StorageStateWatchdog.model_rebuild()
self._storage_state_watchdog = StorageStateWatchdog(
event_bus=self.event_bus,
browser_session=self,
# More conservative defaults when auto-enabled
auto_save_interval=60.0, # 1 minute instead of 30 seconds
save_on_change=False, # Only save on shutdown by default
)
self._storage_state_watchdog.attach_to_session()
self.logger.debug(
f'🍪 StorageStateWatchdog enabled (storage_state: {bool(self.browser_profile.storage_state)}, user_data_dir: {bool(self.browser_profile.user_data_dir)})'
)
else:
self.logger.debug('🍪 StorageStateWatchdog disabled (no storage_state or user_data_dir configured)')
# Initialize LocalBrowserWatchdog
LocalBrowserWatchdog.model_rebuild()
@@ -1054,6 +1086,11 @@ class BrowserSession(BaseModel):
# self.event_bus.on(BrowserStateRequestEvent, self._dom_watchdog.on_BrowserStateRequestEvent)
self._dom_watchdog.attach_to_session()
# Initialize RecordingWatchdog (handles video recording)
RecordingWatchdog.model_rebuild()
self._recording_watchdog = RecordingWatchdog(event_bus=self.event_bus, browser_session=self)
self._recording_watchdog.attach_to_session()
# Mark watchdogs as attached to prevent duplicate attachment
self._watchdogs_attached = True
@@ -1631,7 +1668,7 @@ class BrowserSession(BaseModel):
"""Get list of files downloaded during this browser session.
Returns:
list[str]: List of absolute file paths to downloaded files in this session
list[str]: List of absolute file paths to downloaded files in this session
"""
return self._downloaded_files.copy()
@@ -1758,22 +1795,119 @@ class BrowserSession(BaseModel):
params={'identifier': identifier}, session_id=cdp_session.session_id
)
async def _cdp_set_viewport(
    self, width: int, height: int, device_scale_factor: float = 1.0, mobile: bool = False, target_id: str | None = None
) -> None:
    """Set viewport using CDP Emulation.setDeviceMetricsOverride.

    Note: the merge left the old (pre-target_id) definition of this method
    immediately above the new one; only this definition is kept.

    Args:
        width: Viewport width in CSS pixels.
        height: Viewport height in CSS pixels.
        device_scale_factor: Device scale factor (default 1.0).
        mobile: Whether to emulate a mobile device (default False).
        target_id: Optional target ID to set viewport for. If not provided, uses agent_focus.
    """
    if target_id:
        # Set viewport for a specific target; focus=False so background tabs stay in background
        cdp_session = await self.get_or_create_cdp_session(target_id, focus=False, new_socket=False)
    elif self.agent_focus:
        # Fall back to the currently focused target's session
        cdp_session = self.agent_focus
    else:
        # Nothing sensible to target — warn and bail rather than raise
        self.logger.warning('Cannot set viewport: no target_id provided and agent_focus not initialized')
        return

    await cdp_session.cdp_client.send.Emulation.setDeviceMetricsOverride(
        params={'width': width, 'height': height, 'deviceScaleFactor': device_scale_factor, 'mobile': mobile},
        session_id=cdp_session.session_id,
    )
async def _cdp_get_origins(self) -> list[dict[str, Any]]:
    """Get origins with localStorage and sessionStorage using CDP.

    Walks the frame tree of the current session's target to collect unique
    security origins, then queries DOMStorage for each one. Origins with no
    stored items are omitted from the result.

    Returns:
        List of dicts of the form
        {'origin': str, 'localStorage': [...], 'sessionStorage': [...]},
        where each storage key is present only when it has at least one item.
    """
    origins: list[dict[str, Any]] = []
    cdp_session = await self.get_or_create_cdp_session(target_id=None, new_socket=False)

    try:
        # Enable DOMStorage domain to track storage
        await cdp_session.cdp_client.send.DOMStorage.enable(session_id=cdp_session.session_id)
        try:
            # Get all frames to find unique origins
            frames_result = await cdp_session.cdp_client.send.Page.getFrameTree(session_id=cdp_session.session_id)

            # Extract unique origins from frames
            unique_origins = set()

            def _extract_origins(frame_tree):
                """Recursively extract origins from frame tree."""
                frame = frame_tree.get('frame', {})
                origin = frame.get('securityOrigin')
                # 'null' is the literal string reported for opaque origins — skip those
                if origin and origin != 'null':
                    unique_origins.add(origin)

                # Process child frames
                for child in frame_tree.get('childFrames', []):
                    _extract_origins(child)

            async def _get_storage_items(origin: str, is_local_storage: bool) -> list[dict[str, str]] | None:
                """Helper to get storage items for an origin.

                Returns None (rather than []) when the origin has no items or
                the CDP call fails, so callers can use a simple truthiness check.
                """
                storage_type = 'localStorage' if is_local_storage else 'sessionStorage'
                try:
                    result = await cdp_session.cdp_client.send.DOMStorage.getDOMStorageItems(
                        params={'storageId': {'securityOrigin': origin, 'isLocalStorage': is_local_storage}},
                        session_id=cdp_session.session_id,
                    )
                    items = []
                    for item in result.get('entries', []):
                        if len(item) == 2:  # Each item is [key, value]
                            items.append({'name': item[0], 'value': item[1]})
                    return items if items else None
                except Exception as e:
                    # Per-origin failures are expected (e.g. sandboxed frames) — log at debug only
                    self.logger.debug(f'Failed to get {storage_type} for {origin}: {e}')
                    return None

            _extract_origins(frames_result.get('frameTree', {}))

            # For each unique origin, get localStorage and sessionStorage
            for origin in unique_origins:
                origin_data = {'origin': origin}

                # Get localStorage
                local_storage = await _get_storage_items(origin, is_local_storage=True)
                if local_storage:
                    origin_data['localStorage'] = local_storage

                # Get sessionStorage
                session_storage = await _get_storage_items(origin, is_local_storage=False)
                if session_storage:
                    origin_data['sessionStorage'] = session_storage

                # Only add origin if it has storage data
                if 'localStorage' in origin_data or 'sessionStorage' in origin_data:
                    origins.append(origin_data)
        finally:
            # Always disable DOMStorage tracking when done
            await cdp_session.cdp_client.send.DOMStorage.disable(session_id=cdp_session.session_id)
    except Exception as e:
        # Storage extraction is best-effort; return whatever was collected so far
        self.logger.warning(f'Failed to get origins: {e}')

    return origins
async def _cdp_get_storage_state(self) -> dict:
    """Get storage state (cookies, localStorage, sessionStorage) using CDP.

    Returns:
        dict with 'cookies' (list of cookie dicts from CDP) and 'origins'
        (per-origin localStorage/sessionStorage entries), matching the
        Playwright storage_state shape.
    """
    # Use the _cdp_get_cookies helper which handles session attachment
    cookies = await self._cdp_get_cookies()

    # Get origins with localStorage/sessionStorage
    origins = await self._cdp_get_origins()

    # Merge residue previously left a duplicate 'origins' key here
    # ([] from the old side, `origins` from the new side) — keep only the real value.
    return {
        'cookies': cookies,
        'origins': origins,
    }
async def _cdp_navigate(self, url: str, target_id: TargetID | None = None) -> None:

View File

@@ -0,0 +1,162 @@
"""Video Recording Service for Browser Use Sessions."""
import base64
import logging
import math
import subprocess
from pathlib import Path
from typing import Optional
from browser_use.browser.profile import ViewportSize
try:
import imageio.v2 as iio
import imageio_ffmpeg
import numpy as np
from imageio.core.format import Format
IMAGEIO_AVAILABLE = True
except ImportError:
IMAGEIO_AVAILABLE = False
logger = logging.getLogger(__name__)
def _get_padded_size(size: ViewportSize, macro_block_size: int = 16) -> ViewportSize:
    """Round each dimension of *size* up to a whole multiple of ``macro_block_size``.

    Video codecs such as libx264 encode in macro blocks, so frame dimensions
    that are not multiples of the block size must be padded before encoding.
    """

    def _round_up(value: int) -> int:
        # Ceiling of value / macro_block_size, scaled back up to pixels
        return int(math.ceil(value / macro_block_size)) * macro_block_size

    return ViewportSize(width=_round_up(size['width']), height=_round_up(size['height']))
class VideoRecorderService:
    """
    Handles the video encoding process for a browser session using imageio.

    This service captures individual frames from the CDP screencast, decodes them,
    and appends them to a video file using a pip-installable ffmpeg backend.
    It automatically resizes frames to match the target video dimensions.
    """

    def __init__(self, output_path: Path, size: ViewportSize, framerate: int):
        """
        Initializes the video recorder.

        Args:
            output_path: The full path where the video will be saved.
            size: A ViewportSize object specifying the width and height of the video.
            framerate: The desired framerate for the output video.
        """
        self.output_path = output_path
        self.size = size
        self.framerate = framerate
        # imageio writer, created lazily in start(); None until then and after stop
        self._writer: Optional['Format.Writer'] = None
        # True only while a writer is open and accepting frames
        self._is_active = False
        # Frames are padded (not stretched) up to the codec's macro-block multiple
        self.padded_size = _get_padded_size(self.size)

    def start(self) -> None:
        """
        Prepares and starts the video writer.

        If the required optional dependencies are not installed, this method will
        log an error and do nothing.
        """
        if not IMAGEIO_AVAILABLE:
            logger.error(
                'MP4 recording requires optional dependencies. Please install them with: pip install "browser-use[video]"'
            )
            return

        try:
            self.output_path.parent.mkdir(parents=True, exist_ok=True)
            # The macro_block_size is set to None because we handle padding ourselves
            # (see add_frame's ffmpeg pad filter), so imageio must not resize again.
            self._writer = iio.get_writer(
                str(self.output_path),
                fps=self.framerate,
                codec='libx264',
                quality=8,  # A good balance of quality and file size (1-10 scale)
                pixelformat='yuv420p',  # Ensures compatibility with most players
                macro_block_size=None,
            )
            self._is_active = True
            logger.debug(f'Video recorder started. Output will be saved to {self.output_path}')
        except Exception as e:
            logger.error(f'Failed to initialize video writer: {e}')
            self._is_active = False

    def add_frame(self, frame_data_b64: str) -> None:
        """
        Decodes a base64-encoded PNG frame, resizes it, pads it to be codec-compatible,
        and appends it to the video.

        NOTE(review): this spawns one ffmpeg subprocess per frame; acceptable for
        screencast rates but worth batching if framerates grow — confirm before reuse.

        Args:
            frame_data_b64: A base64-encoded string of the PNG frame data.
        """
        if not self._is_active or not self._writer:
            return

        try:
            frame_bytes = base64.b64decode(frame_data_b64)

            # Build a filter chain for ffmpeg:
            # 1. scale: Resizes the frame to the user-specified dimensions.
            # 2. pad: Adds black bars to meet codec's macro-block requirements,
            #    centering the original content.
            vf_chain = (
                f'scale={self.size["width"]}:{self.size["height"]},'
                f'pad={self.padded_size["width"]}:{self.padded_size["height"]}:(ow-iw)/2:(oh-ih)/2:color=black'
            )
            output_pix_fmt = 'rgb24'

            command = [
                imageio_ffmpeg.get_ffmpeg_exe(),
                '-f',
                'image2pipe',  # Input format from a pipe
                '-c:v',
                'png',  # Specify input codec is PNG
                '-i',
                '-',  # Input from stdin
                '-vf',
                vf_chain,  # Video filter for resizing and padding
                '-f',
                'rawvideo',  # Output format is raw video
                '-pix_fmt',
                output_pix_fmt,  # Output pixel format
                '-',  # Output to stdout
            ]

            # Execute ffmpeg as a subprocess, feeding the PNG via stdin and
            # reading raw RGB bytes from stdout
            proc = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            out, err = proc.communicate(input=frame_bytes)

            if proc.returncode != 0:
                err_msg = err.decode(errors='ignore').strip()
                # The deprecated-pixel-format message is a known harmless warning, not a failure
                if 'deprecated pixel format used' not in err_msg.lower():
                    raise OSError(f'ffmpeg error during resizing/padding: {err_msg}')
                else:
                    logger.debug(f'ffmpeg warning during resizing/padding: {err_msg}')

            # Convert the raw output bytes to a numpy array with the padded dimensions
            img_array = np.frombuffer(out, dtype=np.uint8).reshape((self.padded_size['height'], self.padded_size['width'], 3))

            self._writer.append_data(img_array)
        except Exception as e:
            # Dropping a single frame is preferable to aborting the whole recording
            logger.warning(f'Could not process and add video frame: {e}')

    def stop_and_save(self) -> None:
        """
        Finalizes the video file by closing the writer.

        This method should be called when the recording session is complete.
        """
        if not self._is_active or not self._writer:
            return

        try:
            self._writer.close()
            logger.info(f'📹 Video recording saved successfully to: {self.output_path}')
        except Exception as e:
            logger.error(f'Failed to finalize and save video: {e}')
        finally:
            # Always drop the writer so a second stop_and_save is a no-op
            self._is_active = False
            self._writer = None

View File

@@ -6,16 +6,16 @@ from typing import ClassVar
from bubus import BaseEvent
from pydantic import PrivateAttr
from browser_use.browser.events import DialogOpenedEvent, TabCreatedEvent
from browser_use.browser.events import TabCreatedEvent
from browser_use.browser.watchdog_base import BaseWatchdog
class PopupsWatchdog(BaseWatchdog):
"""Handles JavaScript dialogs (alert, confirm, prompt) by automatically accepting them."""
"""Handles JavaScript dialogs (alert, confirm, prompt) by automatically accepting them immediately."""
# Events this watchdog listens to and emits
LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [TabCreatedEvent, DialogOpenedEvent]
EMITS: ClassVar[list[type[BaseEvent]]] = [DialogOpenedEvent]
LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [TabCreatedEvent]
EMITS: ClassVar[list[type[BaseEvent]]] = []
# Track which targets have dialog handlers registered
_dialog_listeners_registered: set[str] = PrivateAttr(default_factory=set)
@@ -36,107 +36,77 @@ class PopupsWatchdog(BaseWatchdog):
self.logger.debug(f'📌 Starting dialog handler setup for target {target_id}')
try:
# Get all CDP sessions for this target and any child frames
cdp_session = await self.browser_session.get_or_create_cdp_session(
target_id, focus=False
) # don't auto-focus new tabs! sometimes we need to open tabs in background
# Set up async handler for JavaScript dialogs - now we can handle them immediately!
# Also register for the root CDP client to catch dialogs from any frame
if self.browser_session._cdp_client_root:
self.logger.debug('📌 Also registering handler on root CDP client')
# Set up async handler for JavaScript dialogs - accept immediately without event dispatch
async def handle_dialog(event_data, session_id: str | None = None):
"""Handle JavaScript dialog events - accept immediately and dispatch event."""
self.logger.debug(f'🚨 DIALOG EVENT RECEIVED: {event_data}, session_id={session_id}')
dialog_type = event_data.get('type', 'alert')
message = event_data.get('message', '')
url = event_data.get('url')
frame_id = event_data.get('frameId')
self.logger.debug(f"🔔 JavaScript {dialog_type} dialog detected: '{message[:50]}...' - accepting immediately")
# Dispatch the event first so tests can observe it
event = self.browser_session.event_bus.dispatch(
DialogOpenedEvent(
frame_id=frame_id,
dialog_type=dialog_type,
message=message,
url=url,
)
)
await event.event_result(raise_if_none=False, raise_if_any=True, timeout=5.0)
# Accept the dialog immediately to unblock the browser
"""Handle JavaScript dialog events - accept immediately."""
try:
if self.browser_session._cdp_client_root and session_id:
self.logger.debug('🔄 Sending handleJavaScriptDialog command')
await self.browser_session._cdp_client_root.send.Page.handleJavaScriptDialog(
params={'accept': True},
session_id=session_id,
)
self.logger.info('✅ Dialog accepted successfully')
else:
self.logger.error('Cannot accept dialog - CDP client or session not available')
except Exception as e:
self.logger.error(f'Failed to accept dialog: {e}')
dialog_type = event_data.get('type', 'alert')
message = event_data.get('message', '')
self.logger.info(f"🔔 JavaScript {dialog_type} dialog: '{message[:100]}' - attempting to accept...")
self.logger.debug('Trying all approaches to accept dialog...')
# Approach 1: Use the session that detected the dialog
if self.browser_session._cdp_client_root and session_id:
try:
self.logger.debug(f'🔄 Approach 1: Using session {session_id}')
await asyncio.wait_for(
self.browser_session._cdp_client_root.send.Page.handleJavaScriptDialog(
params={'accept': True},
session_id=session_id,
),
timeout=0.25,
)
except (TimeoutError, Exception) as e:
pass
# Approach 2: Try with current agent focus session
if self.browser_session._cdp_client_root and self.browser_session.agent_focus:
try:
self.logger.debug(
f'🔄 Approach 2: Using agent focus session {self.browser_session.agent_focus.session_id}'
)
await asyncio.wait_for(
self.browser_session._cdp_client_root.send.Page.handleJavaScriptDialog(
params={'accept': True},
session_id=self.browser_session.agent_focus.session_id,
),
timeout=0.25,
)
except (TimeoutError, Exception) as e:
pass
except Exception as e:
self.logger.error(f'❌ Critical error in dialog handler: {type(e).__name__}: {e}')
# Register handler on the specific session
cdp_session.cdp_client.register.Page.javascriptDialogOpening(handle_dialog) # type: ignore[arg-type]
self.logger.debug(
f'Successfully registered Page.javascriptDialogOpening handler for session {cdp_session.session_id}'
)
# Also register on root CDP client to catch dialogs from any frame
if hasattr(self.browser_session._cdp_client_root, 'register'):
try:
self.browser_session._cdp_client_root.register.Page.javascriptDialogOpening(handle_dialog) # type: ignore[arg-type]
self.logger.debug('Successfully registered dialog handler on root CDP client for all frames')
except Exception as root_error:
self.logger.warning(f'Failed to register on root CDP client: {root_error}')
# Mark this target as having dialog handling set up
self._dialog_listeners_registered.add(target_id)
self.logger.debug(f'Set up JavaScript dialog handling for tab {target_id}')
except Exception as e:
self.logger.warning(f'Failed to set up dialog handling for tab {target_id}: {e}')
async def on_DialogOpenedEvent(self, event: DialogOpenedEvent) -> None:
"""Handle the async closing of JavaScript dialogs."""
self.logger.debug(
f'📋 on_DialogOpenedEvent called with frame_id={event.frame_id} url={event.url} message={event.message}'
)
assert self.browser_session.agent_focus is not None, 'Agent focus not set when handling DialogOpenedEvent'
current_focus_url = self.browser_session.agent_focus.url
current_focus_target_id = self.browser_session.agent_focus.target_id
cdp_session = await asyncio.wait_for(self.browser_session.cdp_client_for_frame(event.frame_id), timeout=5.0)
try:
# delay to look more human before auto-closing, some popular antibot fingerprint tests check for modals closing too fast
await asyncio.sleep(0.25)
assert self.browser_session._cdp_client_root
# self.browser_session._cdp_client_root.register.Page.javascriptDialogClosed(lambda *args: None)
await asyncio.wait_for(
self.browser_session._cdp_client_root.send.Page.handleJavaScriptDialog(
params={'accept': True},
session_id=cdp_session.session_id,
),
timeout=5.0,
)
# CRITICAL: you must re-focus (Target.activateTarget()) after handling the dialog, otherwise the browser will crash ~5 seconds later
await self.browser_session.get_or_create_cdp_session(target_id=current_focus_target_id, focus=True)
self.logger.info('✅ JS dialog popup handled successfully')
# graveyard of past attempts:
# # new_target = await self.browser_session._cdp_client_root.send.Target.createTarget(params={'url': current_focus_url})
# # self.browser_session.agent_focus = await self.browser_session.get_or_create_cdp_session(target_id=new_target.get('targetId'), new_socket=True, focus=True)
# # raise NotImplementedError('TODO: figure out why this requires a hard refresh and new socket to avoid crashing the entire browser on JS dialogs')
# await asyncio.sleep(0.2)
# await asyncio.wait_for(
# self.browser_session._cdp_client_root.send.Runtime.evaluate(
# params={'expression': '1'},
# session_id=cdp_session.session_id,
# ),
# timeout=5.0,
# )
# # self.browser_session.agent_focus = await self.browser_session.get_or_create_cdp_session(current_focus.target_id, focus=True, new_socket=True)
# # assert await self.browser_session.agent_focus.cdp_client.send.Page.getFrameTree(session_id=self.browser_session.agent_focus.session_id) is not None, "Agent focus not set after handling dialog"
except Exception as e:
self.logger.error(f'Failed to handle JavaScript dialog gracefully: {e}')
# raise
# finally:
# self.event_bus.dispatch(AgentFocusChangedEvent(
# target_id=current_focus_target_id,
# url=self.browser_session.agent_focus.url,
# ))
self.logger.warning(f'Failed to set up popup handling for tab {target_id}: {e}')

View File

@@ -0,0 +1,126 @@
"""Recording Watchdog for Browser Use Sessions."""
import asyncio
from pathlib import Path
from typing import ClassVar
from bubus import BaseEvent
from cdp_use.cdp.page.events import ScreencastFrameEvent
from uuid_extensions import uuid7str
from browser_use.browser.events import BrowserConnectedEvent, BrowserStopEvent
from browser_use.browser.profile import ViewportSize
from browser_use.browser.video_recorder import VideoRecorderService
from browser_use.browser.watchdog_base import BaseWatchdog
class RecordingWatchdog(BaseWatchdog):
    """
    Manages video recording of a browser session using CDP screencasting.

    Starts a Page.startScreencast when the browser connects (if the profile
    configures record_video_dir) and finalizes the video on BrowserStopEvent.
    """

    LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [BrowserConnectedEvent, BrowserStopEvent]
    EMITS: ClassVar[list[type[BaseEvent]]] = []

    # Active recorder; None when recording is disabled, failed, or stopped
    _recorder: VideoRecorderService | None = None

    async def on_BrowserConnectedEvent(self, event: BrowserConnectedEvent) -> None:
        """
        Starts video recording if it is configured in the browser profile.
        """
        profile = self.browser_session.browser_profile
        if not profile.record_video_dir:
            return

        # Dynamically determine video size when not specified in the profile
        size = profile.record_video_size
        if not size:
            self.logger.debug('record_video_size not specified, detecting viewport size...')
            size = await self._get_current_viewport_size()
            if not size:
                self.logger.warning('Cannot start video recording: viewport size could not be determined.')
                return

        # getattr with default keeps older profiles (without record_video_format) working
        video_format = getattr(profile, 'record_video_format', 'mp4').strip('.')
        output_path = Path(profile.record_video_dir) / f'{uuid7str()}.{video_format}'

        self.logger.debug(f'Initializing video recorder for format: {video_format}')
        self._recorder = VideoRecorderService(output_path=output_path, size=size, framerate=profile.record_video_framerate)
        self._recorder.start()

        # start() leaves the recorder inactive if optional deps are missing or the writer failed
        if not self._recorder._is_active:
            self._recorder = None
            return

        # Register the frame handler before starting the screencast so no frame is missed
        self.browser_session.cdp_client.register.Page.screencastFrame(self.on_screencastFrame)

        try:
            cdp_session = await self.browser_session.get_or_create_cdp_session()
            await cdp_session.cdp_client.send.Page.startScreencast(
                params={
                    'format': 'png',
                    'quality': 90,
                    'maxWidth': size['width'],
                    'maxHeight': size['height'],
                    'everyNthFrame': 1,
                },
                session_id=cdp_session.session_id,
            )
            self.logger.info(f'📹 Started video recording to {output_path}')
        except Exception as e:
            # Roll back: finalize whatever was opened so no dangling writer remains
            self.logger.error(f'Failed to start screencast via CDP: {e}')
            if self._recorder:
                self._recorder.stop_and_save()
                self._recorder = None

    async def _get_current_viewport_size(self) -> ViewportSize | None:
        """Gets the current viewport size directly from the browser via CDP.

        Returns None when layout metrics are unavailable or incomplete.
        """
        try:
            cdp_session = await self.browser_session.get_or_create_cdp_session()
            metrics = await cdp_session.cdp_client.send.Page.getLayoutMetrics(session_id=cdp_session.session_id)
            # Use cssVisualViewport for the most accurate representation of the visible area
            viewport = metrics.get('cssVisualViewport', {})
            width = viewport.get('clientWidth')
            height = viewport.get('clientHeight')
            if width and height:
                self.logger.debug(f'Detected viewport size: {width}x{height}')
                return ViewportSize(width=int(width), height=int(height))
        except Exception as e:
            self.logger.warning(f'Failed to get viewport size from browser: {e}')
        return None

    def on_screencastFrame(self, event: ScreencastFrameEvent, session_id: str | None) -> None:
        """
        Synchronous handler for incoming screencast frames.

        Appends the frame to the recorder, then acknowledges it asynchronously
        (CDP stops sending frames until each one is acked).
        """
        if not self._recorder:
            return

        self._recorder.add_frame(event['data'])
        # Fire-and-forget the ack so this sync callback returns immediately
        asyncio.create_task(self._ack_screencast_frame(event, session_id))

    async def _ack_screencast_frame(self, event: ScreencastFrameEvent, session_id: str | None) -> None:
        """
        Asynchronously acknowledges a screencast frame.
        """
        try:
            await self.browser_session.cdp_client.send.Page.screencastFrameAck(
                params={'sessionId': event['sessionId']}, session_id=session_id
            )
        except Exception as e:
            # A missed ack only stalls the screencast briefly; debug-level is enough
            self.logger.debug(f'Failed to acknowledge screencast frame: {e}')

    async def on_BrowserStopEvent(self, event: BrowserStopEvent) -> None:
        """
        Stops the video recording and finalizes the video file.
        """
        if self._recorder:
            # Clear the reference first so late screencast frames are ignored
            recorder = self._recorder
            self._recorder = None
            self.logger.debug('Stopping video recording and saving file...')
            # stop_and_save does blocking encoder work; run it off the event loop
            loop = asyncio.get_event_loop()
            await loop.run_in_executor(None, recorder.stop_and_save)

View File

@@ -156,11 +156,14 @@ class SecurityWatchdog(BaseWatchdog):
return True
else:
# Use fnmatch for other glob patterns
if fnmatch.fnmatch(host, pattern):
if fnmatch.fnmatch(
full_url_pattern if '://' in pattern else host,
pattern,
):
return True
else:
# Exact match
if pattern.startswith(('http://', 'https://', 'chrome://', 'brave://', 'file://')):
if '://' in pattern:
# Full URL pattern
if url.startswith(pattern):
return True

View File

@@ -12,6 +12,7 @@ from pydantic import Field, PrivateAttr
from browser_use.browser.events import (
BrowserConnectedEvent,
BrowserStopEvent,
LoadStorageStateEvent,
SaveStorageStateEvent,
StorageStateLoadedEvent,
@@ -26,6 +27,7 @@ class StorageStateWatchdog(BaseWatchdog):
# Event contracts
LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [
BrowserConnectedEvent,
BrowserStopEvent,
SaveStorageStateEvent,
LoadStorageStateEvent,
]
@@ -51,7 +53,12 @@ class StorageStateWatchdog(BaseWatchdog):
await self._start_monitoring()
# Automatically load storage state after browser start
self.event_bus.dispatch(LoadStorageStateEvent())
await self.event_bus.dispatch(LoadStorageStateEvent())
async def on_BrowserStopEvent(self, event: BrowserStopEvent) -> None:
    """Stop monitoring when browser stops."""
    # Shut down the periodic storage-state save loop before the browser goes away
    self.logger.debug('[StorageStateWatchdog] Stopping storage_state monitoring')
    await self._stop_monitoring()
async def on_SaveStorageStateEvent(self, event: SaveStorageStateEvent) -> None:
"""Handle storage state save request."""

View File

@@ -159,6 +159,10 @@ class OldConfig:
def SKIP_LLM_API_KEY_VERIFICATION(self) -> bool:
return os.getenv('SKIP_LLM_API_KEY_VERIFICATION', 'false').lower()[:1] in 'ty1'
@property
def DEFAULT_LLM(self) -> str:
return os.getenv('DEFAULT_LLM', '')
# Runtime hints
@property
def IN_DOCKER(self) -> bool:
@@ -203,6 +207,7 @@ class FlatEnvConfig(BaseSettings):
AZURE_OPENAI_ENDPOINT: str = Field(default='')
AZURE_OPENAI_KEY: str = Field(default='')
SKIP_LLM_API_KEY_VERIFICATION: bool = Field(default=False)
DEFAULT_LLM: str = Field(default='')
# Runtime hints
IN_DOCKER: bool | None = Field(default=None)

View File

@@ -16,32 +16,16 @@ from browser_use.dom.views import DOMRect, EnhancedSnapshotNode
# Only the ESSENTIAL computed styles for interactivity and visibility detection
REQUIRED_COMPUTED_STYLES = [
# Essential for visibility
'display',
'visibility',
'opacity',
'position',
'z-index',
'pointer-events',
'cursor',
'overflow',
'overflow-x',
'overflow-y',
'width',
'height',
'top',
'left',
'right',
'bottom',
'transform',
'clip',
'clip-path',
'user-select',
'background-color',
'color',
'border',
'margin',
'padding',
# Only styles actually accessed in the codebase (prevents Chrome crashes on heavy sites)
'display', # Used in service.py visibility detection
'visibility', # Used in service.py visibility detection
'opacity', # Used in service.py visibility detection
'overflow', # Used in views.py scrollability detection
'overflow-x', # Used in views.py scrollability detection
'overflow-y', # Used in views.py scrollability detection
'cursor', # Used in enhanced_snapshot.py cursor extraction
'pointer-events', # Used for clickability logic
'position', # Used for visibility logic
]
@@ -81,6 +65,14 @@ def build_snapshot_lookup(
for i, backend_node_id in enumerate(nodes['backendNodeId']):
backend_node_to_snapshot_index[backend_node_id] = i
# PERFORMANCE: Pre-build layout index map to eliminate O(n²) double lookups
# Preserve original behavior: use FIRST occurrence for duplicates
layout_index_map = {}
if layout and 'nodeIndex' in layout:
for layout_idx, node_index in enumerate(layout['nodeIndex']):
if node_index not in layout_index_map: # Only store first occurrence
layout_index_map[node_index] = layout_idx
# Build snapshot lookup for each backend node id
for backend_node_id, snapshot_index in backend_node_to_snapshot_index.items():
is_clickable = None
@@ -98,8 +90,9 @@ def build_snapshot_lookup(
client_rects = None
scroll_rects = None
stacking_contexts = None
for layout_idx, node_index in enumerate(layout.get('nodeIndex', [])):
if node_index == snapshot_index and layout_idx < len(layout.get('bounds', [])):
if snapshot_index in layout_index_map:
layout_idx = layout_index_map[snapshot_index]
if layout_idx < len(layout.get('bounds', [])):
# Parse bounding box
bounds = layout['bounds'][layout_idx]
if len(bounds) >= 4:
@@ -153,8 +146,6 @@ def build_snapshot_lookup(
if layout_idx < len(layout.get('stackingContexts', [])):
stacking_contexts = layout.get('stackingContexts', {}).get('index', [])[layout_idx]
break
snapshot_lookup[backend_node_id] = EnhancedSnapshotNode(
is_clickable=is_clickable,
cursor_style=cursor_style,

View File

@@ -6,8 +6,10 @@ from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Any
from markdown_pdf import MarkdownPdf, Section
from pydantic import BaseModel, Field
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer
INVALID_FILENAME_ERROR_MESSAGE = 'Error: Invalid filename format. Must be alphanumeric with supported extension.'
DEFAULT_FILE_SYSTEM_PATH = 'browseruse_agent_data'
@@ -120,9 +122,32 @@ class PdfFile(BaseFile):
def sync_to_disk_sync(self, path: Path) -> None:
file_path = path / self.full_name
try:
md_pdf = MarkdownPdf()
md_pdf.add_section(Section(self.content))
md_pdf.save(file_path)
# Create PDF document
doc = SimpleDocTemplate(str(file_path), pagesize=letter)
styles = getSampleStyleSheet()
story = []
# Convert markdown content to simple text and add to PDF
# For basic implementation, we'll treat content as plain text
# This avoids the AGPL license issue while maintaining functionality
content_lines = self.content.split('\n')
for line in content_lines:
if line.strip():
# Handle basic markdown headers
if line.startswith('# '):
para = Paragraph(line[2:], styles['Title'])
elif line.startswith('## '):
para = Paragraph(line[3:], styles['Heading1'])
elif line.startswith('### '):
para = Paragraph(line[4:], styles['Heading2'])
else:
para = Paragraph(line, styles['Normal'])
story.append(para)
else:
story.append(Spacer(1, 6))
doc.build(story)
except Exception as e:
raise FileSystemError(f"Error: Could not write to file '{self.full_name}'. {str(e)}")

View File

@@ -37,6 +37,41 @@ if TYPE_CHECKING:
from browser_use.llm.openai.chat import ChatOpenAI
from browser_use.llm.openrouter.chat import ChatOpenRouter
# Type stubs for model instances - enables IDE autocomplete
openai_gpt_4o: ChatOpenAI
openai_gpt_4o_mini: ChatOpenAI
openai_gpt_4_1_mini: ChatOpenAI
openai_o1: ChatOpenAI
openai_o1_mini: ChatOpenAI
openai_o1_pro: ChatOpenAI
openai_o3: ChatOpenAI
openai_o3_mini: ChatOpenAI
openai_o3_pro: ChatOpenAI
openai_o4_mini: ChatOpenAI
openai_gpt_5: ChatOpenAI
openai_gpt_5_mini: ChatOpenAI
openai_gpt_5_nano: ChatOpenAI
azure_gpt_4o: ChatAzureOpenAI
azure_gpt_4o_mini: ChatAzureOpenAI
azure_gpt_4_1_mini: ChatAzureOpenAI
azure_o1: ChatAzureOpenAI
azure_o1_mini: ChatAzureOpenAI
azure_o1_pro: ChatAzureOpenAI
azure_o3: ChatAzureOpenAI
azure_o3_mini: ChatAzureOpenAI
azure_o3_pro: ChatAzureOpenAI
azure_gpt_5: ChatAzureOpenAI
azure_gpt_5_mini: ChatAzureOpenAI
google_gemini_2_0_flash: ChatGoogle
google_gemini_2_0_pro: ChatGoogle
google_gemini_2_5_pro: ChatGoogle
google_gemini_2_5_flash: ChatGoogle
google_gemini_2_5_flash_lite: ChatGoogle
# Models are imported on-demand via __getattr__
# Lazy imports mapping for heavy chat models
_LAZY_IMPORTS = {
'ChatAnthropic': ('browser_use.llm.anthropic.chat', 'ChatAnthropic'),
@@ -51,9 +86,12 @@ _LAZY_IMPORTS = {
'ChatOpenRouter': ('browser_use.llm.openrouter.chat', 'ChatOpenRouter'),
}
# Cache for model instances - only created when accessed
_model_cache: dict[str, 'BaseChatModel'] = {}
def __getattr__(name: str):
"""Lazy import mechanism for heavy chat model imports."""
"""Lazy import mechanism for heavy chat model imports and model instances."""
if name in _LAZY_IMPORTS:
module_path, attr_name = _LAZY_IMPORTS[name]
try:
@@ -61,12 +99,25 @@ def __getattr__(name: str):
module = import_module(module_path)
attr = getattr(module, attr_name)
# Cache the imported attribute in the module's globals
globals()[name] = attr
return attr
except ImportError as e:
raise ImportError(f'Failed to import {name} from {module_path}: {e}') from e
# Check cache first for model instances
if name in _model_cache:
return _model_cache[name]
# Try to get model instances from models module on-demand
try:
from browser_use.llm.models import __getattr__ as models_getattr
attr = models_getattr(name)
# Cache in our clean cache dict
_model_cache[name] = attr
return attr
except (AttributeError, ImportError):
pass
raise AttributeError(f"module '{__name__}' has no attribute '{name}'")

View File

@@ -73,10 +73,11 @@ class ChatGoogle(BaseChatModel):
# Model configuration
model: VerifiedGeminiModels | str
temperature: float | None = None
temperature: float | None = 0.2
top_p: float | None = None
seed: int | None = None
thinking_budget: int | None = None
max_output_tokens: int | None = 4096
config: types.GenerateContentConfigDict | None = None
# Client initialization parameters
@@ -193,6 +194,9 @@ class ChatGoogle(BaseChatModel):
thinking_config_dict: types.ThinkingConfigDict = {'thinking_budget': self.thinking_budget}
config['thinking_config'] = thinking_config_dict
if self.max_output_tokens is not None:
config['max_output_tokens'] = self.max_output_tokens
async def _make_api_call():
if output_format is None:
# Return string response
@@ -389,6 +393,10 @@ class ChatGoogle(BaseChatModel):
):
cleaned['properties'] = {'_placeholder': {'type': 'string'}}
# Also remove 'title' from the required list if it exists
if 'required' in cleaned and isinstance(cleaned.get('required'), list):
cleaned['required'] = [p for p in cleaned['required'] if p != 'title']
return cleaned
elif isinstance(obj, list):
return [clean_schema(item) for item in obj]

171
browser_use/llm/models.py Normal file
View File

@@ -0,0 +1,171 @@
"""
Convenient access to LLM models.
Usage:
from browser_use import llm
# Simple model access
model = llm.azure_gpt_4_1_mini
model = llm.openai_gpt_4o
model = llm.google_gemini_2_5_pro
"""
import os
from typing import TYPE_CHECKING
from browser_use.llm.azure.chat import ChatAzureOpenAI
from browser_use.llm.google.chat import ChatGoogle
from browser_use.llm.openai.chat import ChatOpenAI
if TYPE_CHECKING:
from browser_use.llm.base import BaseChatModel
# Type stubs for IDE autocomplete
openai_gpt_4o: 'BaseChatModel'
openai_gpt_4o_mini: 'BaseChatModel'
openai_gpt_4_1_mini: 'BaseChatModel'
openai_o1: 'BaseChatModel'
openai_o1_mini: 'BaseChatModel'
openai_o1_pro: 'BaseChatModel'
openai_o3: 'BaseChatModel'
openai_o3_mini: 'BaseChatModel'
openai_o3_pro: 'BaseChatModel'
openai_o4_mini: 'BaseChatModel'
openai_gpt_5: 'BaseChatModel'
openai_gpt_5_mini: 'BaseChatModel'
openai_gpt_5_nano: 'BaseChatModel'
azure_gpt_4o: 'BaseChatModel'
azure_gpt_4o_mini: 'BaseChatModel'
azure_gpt_4_1_mini: 'BaseChatModel'
azure_o1: 'BaseChatModel'
azure_o1_mini: 'BaseChatModel'
azure_o1_pro: 'BaseChatModel'
azure_o3: 'BaseChatModel'
azure_o3_mini: 'BaseChatModel'
azure_o3_pro: 'BaseChatModel'
azure_gpt_5: 'BaseChatModel'
azure_gpt_5_mini: 'BaseChatModel'
google_gemini_2_0_flash: 'BaseChatModel'
google_gemini_2_0_pro: 'BaseChatModel'
google_gemini_2_5_pro: 'BaseChatModel'
google_gemini_2_5_flash: 'BaseChatModel'
google_gemini_2_5_flash_lite: 'BaseChatModel'
def get_llm_by_name(model_name: str):
	"""
	Build an LLM instance from a snake_case model name, pulling API keys from the environment.

	Args:
		model_name: String name like 'azure_gpt_4_1_mini', 'openai_gpt_4o', etc.

	Returns:
		LLM instance with API keys from environment variables

	Raises:
		ValueError: If model_name is not recognized
	"""
	if not model_name:
		raise ValueError('Model name cannot be empty')

	# Split into provider prefix and the rest of the model name
	provider, sep, model_part = model_name.partition('_')
	if not sep:
		raise ValueError(f"Invalid model name format: '{model_name}'. Expected format: 'provider_model_name'")

	# Map snake_case tokens back to the dotted/dashed names providers expect.
	# The boolean marks whether remaining underscores should also become dashes.
	known_tokens = (
		('gpt_4_1_mini', 'gpt-4.1-mini', False),
		('gpt_4o_mini', 'gpt-4o-mini', False),
		('gpt_4o', 'gpt-4o', False),
		('gemini_2_0', 'gemini-2.0', True),
		('gemini_2_5', 'gemini-2.5', True),
	)
	for token, canonical, dash_rest in known_tokens:
		if token in model_part:
			model = model_part.replace(token, canonical)
			if dash_rest:
				model = model.replace('_', '-')
			break
	else:
		# Generic fallback: underscores become dashes (e.g. 'o3_mini' -> 'o3-mini')
		model = model_part.replace('_', '-')

	# OpenAI Models
	if provider == 'openai':
		return ChatOpenAI(model=model, api_key=os.getenv('OPENAI_API_KEY'))

	# Azure OpenAI Models (two env var spellings are accepted for the key)
	if provider == 'azure':
		return ChatAzureOpenAI(
			model=model,
			api_key=os.getenv('AZURE_OPENAI_KEY') or os.getenv('AZURE_OPENAI_API_KEY'),
			azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
		)

	# Google Models
	if provider == 'google':
		return ChatGoogle(model=model, api_key=os.getenv('GOOGLE_API_KEY'))

	available_providers = ['openai', 'azure', 'google']
	raise ValueError(f"Unknown provider: '{provider}'. Available providers: {', '.join(available_providers)}")
# Pre-configured model instances (lazy loaded via __getattr__)
def __getattr__(name: str) -> 'BaseChatModel':
	"""Create model instances on demand with API keys from environment."""
	# Chat classes are resolved directly rather than via the factory.
	chat_classes = {
		'ChatOpenAI': ChatOpenAI,
		'ChatAzureOpenAI': ChatAzureOpenAI,
		'ChatGoogle': ChatGoogle,
	}
	if name in chat_classes:
		return chat_classes[name]  # type: ignore
	# Anything else is treated as a model instance name like 'openai_gpt_4o' - the main use case.
	try:
		return get_llm_by_name(name)
	except ValueError:
		raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
__all__ = [
'ChatOpenAI',
'ChatAzureOpenAI',
'ChatGoogle',
'get_llm_by_name',
# OpenAI instances - created on demand
'openai_gpt_4o',
'openai_gpt_4o_mini',
'openai_gpt_4_1_mini',
'openai_o1',
'openai_o1_mini',
'openai_o1_pro',
'openai_o3',
'openai_o3_mini',
'openai_o3_pro',
'openai_o4_mini',
'openai_gpt_5',
'openai_gpt_5_mini',
'openai_gpt_5_nano',
# Azure instances - created on demand
'azure_gpt_4o',
'azure_gpt_4o_mini',
'azure_gpt_4_1_mini',
'azure_o1',
'azure_o1_mini',
'azure_o1_pro',
'azure_o3',
'azure_o3_mini',
'azure_o3_pro',
'azure_gpt_5',
'azure_gpt_5_mini',
# Google instances - created on demand
'google_gemini_2_0_flash',
'google_gemini_2_0_pro',
'google_gemini_2_5_pro',
'google_gemini_2_5_flash',
'google_gemini_2_5_flash_lite',
]

View File

@@ -1,8 +1,10 @@
from collections.abc import Mapping
from dataclasses import dataclass
from typing import Any, TypeVar, overload
import httpx
from ollama import AsyncClient as OllamaAsyncClient
from ollama import Options
from pydantic import BaseModel
from browser_use.llm.base import BaseChatModel
@@ -30,6 +32,7 @@ class ChatOllama(BaseChatModel):
host: str | None = None
timeout: float | httpx.Timeout | None = None
client_params: dict[str, Any] | None = None
ollama_options: Mapping[str, Any] | Options | None = None
# Static
@property
@@ -70,6 +73,7 @@ class ChatOllama(BaseChatModel):
response = await self.get_client().chat(
model=self.model,
messages=ollama_messages,
options=self.ollama_options,
)
return ChatInvokeCompletion(completion=response.message.content or '', usage=None)
@@ -80,6 +84,7 @@ class ChatOllama(BaseChatModel):
model=self.model,
messages=ollama_messages,
format=schema,
options=self.ollama_options,
)
completion = response.message.content or ''

View File

@@ -3,7 +3,6 @@ import base64
import io
import random
from lmnr import Laminar
from PIL import Image, ImageDraw, ImageFont
from browser_use.llm.google.chat import ChatGoogle
@@ -17,8 +16,6 @@ from browser_use.llm.messages import (
UserMessage,
)
Laminar.initialize()
def create_random_text_image(text: str = 'hello world', width: int = 4000, height: int = 4000) -> str:
# Create image with random background color

View File

@@ -138,7 +138,7 @@ def setup_logging(stream=None, log_level=None, force_setup=False, debug_log_file
# Create debug log file handler
if debug_log_file:
debug_handler = logging.FileHandler(debug_log_file)
debug_handler = logging.FileHandler(debug_log_file, encoding='utf-8')
debug_handler.setLevel(logging.DEBUG)
debug_handler.setFormatter(BrowserUseFormatter('%(asctime)s - %(levelname)-8s [%(name)s] %(message)s', logging.DEBUG))
file_handlers.append(debug_handler)
@@ -146,7 +146,7 @@ def setup_logging(stream=None, log_level=None, force_setup=False, debug_log_file
# Create info log file handler
if info_log_file:
info_handler = logging.FileHandler(info_log_file)
info_handler = logging.FileHandler(info_log_file, encoding='utf-8')
info_handler.setLevel(logging.INFO)
info_handler.setFormatter(BrowserUseFormatter('%(asctime)s - %(levelname)-8s [%(name)s] %(message)s', logging.INFO))
file_handlers.append(info_handler)

View File

@@ -8,6 +8,7 @@ from inspect import Parameter, iscoroutinefunction, signature
from types import UnionType
from typing import Any, Generic, Optional, TypeVar, Union, get_args, get_origin
import pyotp
from pydantic import BaseModel, Field, RootModel, create_model
from browser_use.browser import BrowserSession
@@ -433,10 +434,17 @@ class Registry(Generic[Context]):
def recursively_replace_secrets(value: str | dict | list) -> str | dict | list:
if isinstance(value, str):
matches = secret_pattern.findall(value)
# check if the placeholder key, like x_password is in the output parameters of the LLM and replace it with the sensitive data
for placeholder in matches:
if placeholder in applicable_secrets:
value = value.replace(f'<secret>{placeholder}</secret>', applicable_secrets[placeholder])
# generate a totp code if secret is a 2fa secret
if 'bu_2fa_code' in placeholder:
totp = pyotp.TOTP(applicable_secrets[placeholder], digits=6)
replacement_value = totp.now()
else:
replacement_value = applicable_secrets[placeholder]
value = value.replace(f'<secret>{placeholder}</secret>', replacement_value)
replaced_placeholders.add(placeholder)
else:
# Keep track of missing placeholders

View File

@@ -236,17 +236,17 @@ class Tools(Generic[Context]):
return ActionResult(error=error_msg)
@self.registry.action(
'Wait for x seconds default 3 (max 10 seconds). This can be used to wait until the page is fully loaded.'
'Wait for x seconds (default 3) (max 30 seconds). This can be used to wait until the page is fully loaded.'
)
async def wait(seconds: int = 3):
# Cap wait time at maximum 10 seconds
# Cap wait time at maximum 30 seconds
# Reduce the wait time by 3 seconds to account for the llm call which takes at least 3 seconds
# So if the model decides to wait for 5 seconds, the llm call took at least 3 seconds, so we only need to wait for 2 seconds
# Note by Mert: the above doesnt make sense because we do the LLM call right after this or this could be followed by another action after which we would like to wait
# so I revert this.
actual_seconds = min(max(seconds, 0), 10)
memory = f'Waited for {actual_seconds} seconds'
logger.info(f'🕒 {memory}')
actual_seconds = min(max(seconds - 3, 0), 30)
memory = f'Waited for {seconds} seconds'
logger.info(f'🕒 waited for {actual_seconds} seconds + 3 seconds for LLM call')
await asyncio.sleep(actual_seconds)
return ActionResult(extracted_content=memory, long_term_memory=memory)
@@ -266,7 +266,7 @@ class Tools(Generic[Context]):
# Look up the node from the selector map
node = await browser_session.get_element_by_index(params.index)
if node is None:
raise ValueError(f'Element index {params.index} not found in DOM')
raise ValueError(f'Element index {params.index} not found in browser state')
event = browser_session.event_bus.dispatch(
ClickElementEvent(node=node, while_holding_ctrl=params.while_holding_ctrl or False)
@@ -315,7 +315,7 @@ class Tools(Generic[Context]):
# Look up the node from the selector map
node = await browser_session.get_element_by_index(params.index)
if node is None:
raise ValueError(f'Element index {params.index} not found in DOM')
raise ValueError(f'Element index {params.index} not found in browser state')
# Dispatch type text event with node
try:
@@ -325,7 +325,7 @@ class Tools(Generic[Context]):
await event
input_metadata = await event.event_result(raise_if_any=True, raise_if_none=False)
msg = f"Input '{params.text}' into element {params.index}."
logger.info(msg)
logger.debug(msg)
# Include input coordinates in metadata if available
return ActionResult(
@@ -669,7 +669,9 @@ You will be given a query and the markdown of a webpage that has been filtered t
raise RuntimeError(str(e))
@self.registry.action(
"""Scroll the page by specified number of pages (set down=True to scroll down, down=False to scroll up, num_pages=number of pages to scroll like 0.5 for half page, 10.0 for ten pages, etc.). Optional index parameter to scroll within a specific element or its scroll container (works well for dropdowns and custom UI components). If you want to scroll the entire page, don't use index.
"""Scroll the page by specified number of pages (set down=True to scroll down, down=False to scroll up, num_pages=number of pages to scroll like 0.5 for half page, 10.0 for ten pages, etc.).
Default behavior is to scroll the entire page. This is enough for most cases.
Optional if there are multiple scroll containers, use frame_element_index parameter with an element inside the container you want to scroll in. For that you must use indices that exist in your browser_state (works well for dropdowns and custom UI components).
Instead of scrolling step after step, use a high number of pages at once like 10 to get to the bottom of the page.
If you know where you want to scroll to, use scroll_to_text instead of this tool.
""",
@@ -681,18 +683,15 @@ You will be given a query and the markdown of a webpage that has been filtered t
# Special case: index 0 means scroll the whole page (root/body element)
node = None
if params.frame_element_index is not None and params.frame_element_index != 0:
try:
node = await browser_session.get_element_by_index(params.frame_element_index)
if node is None:
# Element not found - return error
raise ValueError(f'Element index {params.frame_element_index} not found in DOM')
except Exception as e:
# Error getting element - return error
raise ValueError(f'Failed to get element {params.frame_element_index}: {e}') from e
node = await browser_session.get_element_by_index(params.frame_element_index)
if node is None:
# Element does not exist
msg = f'Element index {params.frame_element_index} not found in browser state'
return ActionResult(error=msg)
# Dispatch scroll event with node - the complex logic is handled in the event handler
# Convert pages to pixels (assuming 800px per page as standard viewport height)
pixels = int(params.num_pages * 800)
# Convert pages to pixels (assuming 1000px per page as standard viewport height)
pixels = int(params.num_pages * 1000)
event = browser_session.event_bus.dispatch(
ScrollEvent(direction='down' if params.down else 'up', amount=pixels, node=node)
)
@@ -765,7 +764,7 @@ You will be given a query and the markdown of a webpage that has been filtered t
# Dropdown Actions
@self.registry.action(
'Get list of option values exposed by a specific dropdown input field. Only works on dropdown-style form elements (<select>, Semantic UI/aria-labeled select, etc.).',
'Get list of values for a dropdown input field. Only works on dropdown-style form elements (<select>, Semantic UI/aria-labeled select, etc.). Do not use this tool for none dropdown elements.',
param_model=GetDropdownOptionsAction,
)
async def get_dropdown_options(params: GetDropdownOptionsAction, browser_session: BrowserSession):
@@ -773,7 +772,7 @@ You will be given a query and the markdown of a webpage that has been filtered t
# Look up the node from the selector map
node = await browser_session.get_element_by_index(params.index)
if node is None:
raise ValueError(f'Element index {params.index} not found in DOM')
raise ValueError(f'Element index {params.index} not found in browser state')
# Dispatch GetDropdownOptionsEvent to the event handler
@@ -799,7 +798,7 @@ You will be given a query and the markdown of a webpage that has been filtered t
# Look up the node from the selector map
node = await browser_session.get_element_by_index(params.index)
if node is None:
raise ValueError(f'Element index {params.index} not found in DOM')
raise ValueError(f'Element index {params.index} not found in browser state')
# Dispatch SelectDropdownOptionEvent to the event handler
from browser_use.browser.events import SelectDropdownOptionEvent

View File

@@ -2,6 +2,7 @@ import asyncio
import logging
import os
import platform
import re
import signal
import time
from collections.abc import Callable, Coroutine
@@ -16,6 +17,9 @@ from dotenv import load_dotenv
load_dotenv()
# Pre-compiled regex for URL detection - used in URL shortening
URL_PATTERN = re.compile(r'https?://[^\s<>"\']+|www\.[^\s<>"\']+|[^\s<>"\']+\.[a-z]{2,}(?:/[^\s<>"\']*)?', re.IGNORECASE)
logger = logging.getLogger(__name__)

View File

@@ -1,146 +0,0 @@
---
title: "Node.js"
description: "Get started with Browser Use Cloud API using Node.js"
icon: "node-js"
mode: "wide"
---
<img src="/images/cloud-banner-js.png" alt="Browser Use Node.js" width="full" />
> The repository is available on [GitHub](https://github.com/browser-use/browser-use-node)
<CodeGroup>
```sh npm
npm install browser-use-sdk
```
```sh pnpm
pnpm add browser-use-sdk
```
```sh yarn
yarn add browser-use-sdk
```
```sh bun
bun add browser-use-sdk
```
</CodeGroup>
☝️ Get your API Key at [Browser Use Cloud](https://cloud.browser-use.com/billing)
```ts
import BrowserUse from "browser-use-sdk";
const client = new BrowserUse({
apiKey: "bu_...",
});
const result = await client.tasks.run({
task: "Search for the top 10 Hacker News posts and return the title and url.",
});
console.log(result.doneOutput);
```
> The full API of this library can be found in [api.md](https://github.com/browser-use/browser-use-node/blob/main/api.md).
### Structured Output with Zod
```ts
import z from "zod";
const TaskOutput = z.object({
posts: z.array(
z.object({
title: z.string(),
url: z.string(),
})
),
});
const result = await client.tasks.run({
task: "Search for the top 10 Hacker News posts and return the title and url.",
schema: TaskOutput,
});
for (const post of result.parsedOutput.posts) {
console.log(`${post.title} - ${post.url}`);
}
```
### Streaming Agent Updates
```ts
const task = await browseruse.tasks.create({
task: "Search for the top 10 Hacker News posts and return the title and url.",
schema: TaskOutput,
});
const stream = browseruse.tasks.stream({
taskId: task.id,
schema: TaskOutput,
});
for await (const msg of stream) {
switch (msg.status) {
case "started":
console.log(`started: ${msg.data.session.liveUrl}`);
break;
case "paused":
case "stopped":
console.log(`running: ${msg}`);
break;
case "finished":
console.log(`done:`);
for (const post of msg.parsedOutput.posts) {
console.log(`${post.title} - ${post.url}`);
}
break;
}
}
```
## Webhook Verification
> We encourage you to use the SDK functions that verify and parse webhook events.
```ts
import {
verifyWebhookEventSignature,
type WebhookAgentTaskStatusUpdatePayload,
} from "browser-use-sdk/lib/webhooks";
export async function POST(req: Request) {
const signature = req.headers["x-browser-use-signature"] as string;
const timestamp = req.headers["x-browser-use-timestamp"] as string;
const event = await verifyWebhookEventSignature(
{
body,
signature,
timestamp,
},
{
secret: SECRET_KEY,
}
);
if (!event.ok) {
return;
}
switch (event.event.type) {
case "agent.task.status_update":
break;
case "test":
break;
default:
break;
}
}
```

View File

@@ -1,131 +0,0 @@
---
title: "Python"
description: "Get started with Browser Use Cloud API using Python"
icon: "python"
mode: "wide"
---
<img
src="/images/cloud-banner-python.png"
alt="Browser Use Python"
width="full"
/>
> The repository is available on [GitHub](https://github.com/browser-use/browser-use-python).
```sh
pip install browser-use-sdk
```
☝️ Get your API Key at [Browser Use Cloud](https://cloud.browser-use.com/billing)
```python
from browser_use_sdk import BrowserUse
client = BrowserUse(api_key="bu_...")
result = client.tasks.run(
task="Search for the top 10 Hacker News posts and return the title and url."
)
result.done_output
```
> The full API reference can be found in [api.md](https://github.com/browser-use/browser-use-python/blob/main/api.md).
## Async usage
Simply import `AsyncBrowserUse` instead of `BrowserUse` and use `await` with each API call:
```python
import os
import asyncio
from browser_use_sdk import AsyncBrowserUse
client = AsyncBrowserUse(
api_key=os.environ.get("BROWSER_USE_API_KEY"), # This is the default and can be omitted
)
async def main() -> None:
task = await client.tasks.run(
task="Search for the top 10 Hacker News posts and return the title and url.",
)
print(task.done_output)
asyncio.run(main())
```
Functionality between the synchronous and asynchronous clients is otherwise identical.
## Structured Output with Pydantic
Browser Use Python SDK provides first class support for Pydantic models.
```py
class HackerNewsPost(BaseModel):
title: str
url: str
class SearchResult(BaseModel):
posts: List[HackerNewsPost]
async def main() -> None:
structured_result = await client.tasks.run(
task="""
Find top 10 Hacker News articles and return the title and url.
""",
structured_output_json=SearchResult,
)
if structured_result.parsed_output is not None:
print("Top HackerNews Posts:")
for post in structured_result.parsed_output.posts:
print(f" - {post.title} - {post.url}")
asyncio.run(main())
```
## Streaming Updates with Async Iterators
```py
class HackerNewsPost(BaseModel):
title: str
url: str
class SearchResult(BaseModel):
posts: List[HackerNewsPost]
async def main() -> None:
structured_task = await client.tasks.create(
task="""
Find top 10 Hacker News articles and return the title and url.
""",
structured_output_json=SearchResult,
)
async for update in client.tasks.stream(structured_task.id, structured_output_json=SearchResult):
if len(update.steps) > 0:
last_step = update.steps[-1]
print(f"{update.status}: {last_step.url} ({last_step.next_goal})")
else:
print(f"{update.status}")
if update.status == "finished":
if update.parsed_output is None:
print("No output...")
else:
print("Top HackerNews Posts:")
for post in update.parsed_output.posts:
print(f" - {post.title} - {post.url}")
break
asyncio.run(main())
```
## Advanced
For more advanced usage of the SDK and contributions to the SDK, see [Github repository](https://github.com/browser-use/browser-use-python).

View File

@@ -1,79 +0,0 @@
---
title: "Quickstart"
description: "Skip the setup with Browser Use Cloud"
icon: "cloud"
mode: "wide"
---
<img
className="block dark:hidden rounded-2xl"
src="/images/cloud-banner.png"
alt="Browser Use Cloud Banner"
/>
<img
className="hidden dark:block rounded-2xl"
src="/images/cloud-banner-dark.png"
alt="Browser Use Cloud Banner"
/>
## Get Started
☝️ Get your API Key at [Browser Use Cloud](https://cloud.browser-use.com) then choose your language.
<CardGroup cols={2}>
<Card
title="Python SDK"
icon="python"
href="/cloud/v2/python-quickstart"
>
Browser Use NPC Mode SDK 🤖
</Card>
<Card
title="Node.js SDK"
icon="node-js"
href="/cloud/v2/node-quickstart"
>
Browser Use Wizard Mode SDK 🧙‍♂️
</Card>
</CardGroup>
{/* <br /> */}
> To play around with the API, you can use the [Browser Use Cloud Playground](https://cloud.browser-use.com/playground).
## Examples
Explore quick start examples to see how to use the SDKs.
<CardGroup cols={2}>
<Card
title="Python Examples"
icon="python"
href="https://github.com/browser-use/browser-use-examples/tree/main/python"
>
Explore quick start examples for Python.
</Card>
<Card
title="Typescript Examples"
icon="js"
href="https://github.com/browser-use/browser-use-examples/tree/main/typescript"
>
Explore quick start examples for Typescript.
</Card>
<Card
title="NextJS Examples"
icon={<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg" width="24" height="24" fill="currentColor" className="remixicon text-basis h-8 w-8 text-primary"><path d="M12 22C6.47715 22 2 17.5228 2 12C2 6.47715 6.47715 2 12 2C17.5228 2 22 6.47715 22 12C22 17.5228 17.5228 22 12 22ZM15.9999 8H14.6499V12H15.9999V8ZM9.34609 9.70937L15.405 17.5379L16.4591 16.7293L9.68281 8H8V15.9969H9.34609V9.70937Z"></path></svg>}
href="https://github.com/browser-use/browser-use-examples/tree/main/typescript/scrapper"
>
Explore quick start examples for NextJS.
</Card>
</CardGroup>

View File

@@ -21,6 +21,7 @@ mode: "wide"
- `initial_actions`: List of actions to run before the main task without LLM. [Example](https://github.com/browser-use/browser-use/blob/main/examples/features/initial_actions.py)
- `max_actions_per_step` (default: `10`): Maximum actions per step, e.g. for form filling the agent can output 10 fields at once. We execute the actions until the page changes.
- `max_failures` (default: `3`): Maximum retries for steps with errors
- `final_response_after_failure` (default: `True`): If True, attempt to force one final model call with intermediate output after max_failures is reached
- `use_thinking` (default: `True`): Controls whether the agent uses its internal "thinking" field for explicit reasoning steps.
- `flash_mode` (default: `False`): Fast mode that skips evaluation, next goal and thinking and only uses memory. If `flash_mode` is enabled, it overrides `use_thinking` and disables the thinking process entirely. [Example](https://github.com/browser-use/browser-use/blob/main/examples/getting_started/05_fast_agent.py)

View File

@@ -73,7 +73,9 @@ mode: "wide"
- `screen`: Screen size information, same format as `window_size`
## Recording & Debugging
- `record_video_dir`: Directory to save video recordings as `.webm` files
- `record_video_dir`: Directory to save video recordings as `.mp4` files
- `record_video_size` (default: `ViewportSize`): The frame size (width, height) of the video recording.
- `record_video_framerate` (default: `30`): The framerate to use for the video recording.
- `record_har_path`: Path to save network trace files as `.har` format
- `traces_dir`: Directory to save complete trace files for debugging
- `record_har_content` (default: `'embed'`): HAR content mode (`'omit'`, `'embed'`, `'attach'`)

View File

@@ -0,0 +1,92 @@
---
title: "Prompting Guide"
description: "Tips and tricks"
icon: "lightbulb"
---
Prompting can drastically improve performance and work around existing limitations of the library.
### 1. Be Specific vs Open-Ended
**✅ Specific (Recommended)**
```python
task = """
1. Go to https://quotes.toscrape.com/
2. Use extract_structured_data action with the query "first 3 quotes with their authors"
3. Save results to quotes.csv using write_file action
4. Do a google search for the first quote and find when it was written
"""
```
**❌ Open-Ended**
```python
task = "Go to web and make money"
```
### 2. Name Actions Directly
When you know exactly what the agent should do, reference actions by name:
```python
task = """
1. Use search_google action to find "Python tutorials"
2. Use click_element_by_index to open first result in a new tab
3. Use scroll action to scroll down 2 pages
4. Use extract_structured_data to extract the names of the first 5 items
5. Wait for 2 seconds if the page is not loaded, refresh it and wait 10 sec
6. Use send_keys action with "Tab Tab ArrowDown Enter"
"""
```
See [Available Tools](/customize/tools/available) for the complete list of actions.
### 3. Handle interaction problems via keyboard navigation
Sometimes buttons can't be clicked (you found a bug in the library - open an issue).
Good news - often you can work around it with keyboard navigation!
```python
task = """
If the submit button cannot be clicked:
1. Use send_keys action with "Tab Tab Enter" to navigate and activate
2. Or use send_keys with "ArrowDown ArrowDown Enter" for form submission
"""
```
### 4. Custom Actions Integration
```python
# When you have custom actions
@controller.action("Get 2FA code from authenticator app")
async def get_2fa_code():
# Your implementation
pass
task = """
Login with 2FA:
1. Enter username/password
2. When prompted for 2FA, use get_2fa_code action
3. NEVER try to extract 2FA codes from the page manually
4. ALWAYS use the get_2fa_code action for authentication codes
"""
```
### 5. Error Recovery
```python
task = """
Robust data extraction:
1. Go to openai.com to find their CEO
2. If navigation fails due to anti-bot protection:
- Use google search to find the CEO
3. If page times out, use go_back and try alternative approach
"""
```
The key to effective prompting is being specific about actions.

View File

@@ -1,6 +1,6 @@
---
title: "Sensitive Data"
description: "Handle sensitive information securely and avoid sending PII & passwords to the LLM."
description: "Handle secret information securely and avoid sending PII & passwords to the LLM."
icon: "shield"
mode: "wide"
---
@@ -11,14 +11,24 @@ import os
from browser_use import Agent, Browser, ChatOpenAI
os.environ['ANONYMIZED_TELEMETRY'] = "false"
company_credentials = {'x_user': 'your-real-username@email.com', 'x_pass': 'your-real-password123'}
# Option 1: Secrets available for all websites
sensitive_data = company_credentials
# Option 2: Secrets per domain with regex
# sensitive_data = {
# 'https://*.example-staging.com': company_credentials,
# 'http*://test.example.com': company_credentials,
# 'https://example.com': company_credentials,
# 'https://google.com': {'g_email': 'user@gmail.com', 'g_pass': 'google_password'},
# }
agent = Agent(
task='Log into example.com with username x_user and password x_pass',
sensitive_data={
'https://example.com': {
'x_user': 'your-real-username@email.com',
'x_pass': 'your-real-password123',
},
},
sensitive_data=sensitive_data,
use_vision=False, # Disable vision to prevent LLM seeing sensitive data in screenshots
llm=ChatOpenAI(model='gpt-4.1-mini'),
)

View File

@@ -2,7 +2,6 @@
title: "Lifecycle Hooks"
description: "Customize agent behavior with lifecycle hooks"
icon: "Wrench"
author: "Carlos A. Planchón"
mode: "wide"
---
@@ -27,55 +26,63 @@ Each hook should be an `async` callable function that accepts the `agent` instan
### Basic Example
```python
import asyncio
from pathlib import Path

from browser_use import Agent, ChatOpenAI
from browser_use.browser.events import ScreenshotEvent


async def my_step_hook(agent: Agent):
	# inside a hook you can access all the state and methods under the Agent object:
	#   agent.settings, agent.state, agent.task
	#   agent.tools, agent.llm, agent.browser_session
	#   agent.pause(), agent.resume(), agent.add_new_task(...), etc.

	# You also have direct access to the browser state
	state = await agent.browser_session.get_browser_state_summary()

	current_url = state.url
	visit_log = agent.history.urls()
	previous_url = visit_log[-2] if len(visit_log) >= 2 else None
	print(f'Agent was last on URL: {previous_url} and is now on {current_url}')

	cdp_session = await agent.browser_session.get_or_create_cdp_session()

	# Example: Get page HTML content
	doc = await cdp_session.cdp_client.send.DOM.getDocument(session_id=cdp_session.session_id)
	html_result = await cdp_session.cdp_client.send.DOM.getOuterHTML(
		params={'nodeId': doc['root']['nodeId']}, session_id=cdp_session.session_id
	)
	page_html = html_result['outerHTML']

	# Example: Take a screenshot using the event system
	screenshot_event = agent.browser_session.event_bus.dispatch(ScreenshotEvent(full_page=False))
	await screenshot_event
	result = await screenshot_event.event_result(raise_if_any=True, raise_if_none=True)

	# Example: pause agent execution and resume it based on some custom code
	if '/finished' in current_url:
		agent.pause()
		Path('result.txt').write_text(page_html)
		input('Saved "finished" page content to result.txt, press [Enter] to resume...')
		agent.resume()


async def main():
	agent = Agent(
		task='Search for the latest news about AI',
		llm=ChatOpenAI(model='gpt-5-mini'),
	)

	await agent.run(
		on_step_start=my_step_hook,
		# on_step_end=...
		max_steps=10,
	)


if __name__ == '__main__':
	asyncio.run(main())
```
## Data Available in Hooks
@@ -96,287 +103,17 @@ When working with agent hooks, you have access to the entire `Agent` instance. H
- `agent.history.model_actions()`: Actions taken by the agent
- `agent.history.extracted_content()`: Content extracted from web pages
- `agent.history.urls()`: URLs visited by the agent
- `agent.browser_session` gives direct access to the `BrowserSession` and CDP interface
- `agent.browser_session.agent_focus`: Get the current CDP session the agent is focused on
- `agent.browser_session.get_or_create_cdp_session()`: Get the current CDP session for browser interaction
- `agent.browser_session.get_tabs()`: Get all tabs currently open
- `agent.browser_session.get_page_html()`: Current page HTML
- `agent.browser_session.take_screenshot()`: Screenshot of the current page
- `agent.browser_session.get_current_page_url()`: Get the URL of the current active tab
- `agent.browser_session.get_current_page_title()`: Get the title of the current active tab
## Tips for Using Hooks
- **Avoid blocking operations**: Since hooks run in the same execution thread as the agent, keep them efficient and avoid blocking operations.
- **Handle exceptions**: Make sure your hook functions handle exceptions gracefully to prevent interrupting the agent's main flow.
- **Use custom tools instead**: hooks are fairly advanced, most things can be implemented with [custom tools](/customize/tools/basics) instead
- **Increase step_timeout**: If your hook is doing something that takes a long time, you can increase the `step_timeout` parameter in the `Agent(...)` constructor.
---
## Complex Example: Agent Activity Recording System
This comprehensive example demonstrates a complete implementation for recording and saving Browser-Use agent activity, consisting of both server and client components.
### Setup Instructions
To use this example, you'll need to:
1. Set up the required dependencies:
```bash
pip install fastapi uvicorn prettyprinter pyobjtojson python-dotenv browser-use
```
2. Create two separate Python files:
- `api.py` - The FastAPI server component
- `client.py` - The Browser-Use agent with recording hook
3. Run both components:
- Start the API server first: `python api.py`
- Then run the client: `python client.py`
### Server Component (api.py)
The server component handles receiving and storing the agent's activity data:
```python
#!/usr/bin/env python3
#
# FastAPI API to record and save Browser-Use activity data.
# Save this code to api.py and run with `python api.py`
#
import base64
import json
from pathlib import Path

import prettyprinter
import uvicorn
from fastapi import FastAPI, Request

prettyprinter.install_extras()


# Utility function to save screenshots
def b64_to_png(b64_string: str, output_file):
	"""
	Convert a Base64-encoded string to a PNG file.

	:param b64_string: A string containing Base64-encoded data
	:param output_file: The path to the output PNG file
	"""
	with open(output_file, "wb") as f:
		f.write(base64.b64decode(b64_string))


# Initialize FastAPI app
app = FastAPI()


@app.post("/post_agent_history_step")
async def post_agent_history_step(request: Request):
	"""Receive one agent step as JSON and persist it to recordings/<n>.json."""
	data = await request.json()
	prettyprinter.cpprint(data)

	# Ensure the "recordings" folder exists using pathlib
	recordings_folder = Path("recordings")
	recordings_folder.mkdir(exist_ok=True)

	# Determine the next file number by examining existing .json files
	existing_numbers = []
	for item in recordings_folder.iterdir():
		if item.is_file() and item.suffix == ".json":
			try:
				existing_numbers.append(int(item.stem))
			except ValueError:
				# In case the file name isn't just a number
				pass

	# max(..., default=0) covers the empty-folder case without an if/else branch
	next_number = max(existing_numbers, default=0) + 1

	# Construct the file path
	file_path = recordings_folder / f"{next_number}.json"

	# Save the JSON data to the file
	with file_path.open("w") as f:
		json.dump(data, f, indent=2)

	# Optionally save screenshot if needed
	# if "website_screenshot" in data and data["website_screenshot"]:
	#     screenshot_folder = Path("screenshots")
	#     screenshot_folder.mkdir(exist_ok=True)
	#     b64_to_png(data["website_screenshot"], screenshot_folder / f"{next_number}.png")

	return {"status": "ok", "message": f"Saved to {file_path}"}


if __name__ == "__main__":
	print("Starting Browser-Use recording API on http://0.0.0.0:9000")
	uvicorn.run(app, host="0.0.0.0", port=9000)
```
### Client Component (client.py)
The client component runs the Browser-Use agent with a recording hook:
```python
#!/usr/bin/env python3
#
# Client to record and save Browser-Use activity.
# Save this code to client.py and run with `python client.py`
#
import asyncio

import requests
from dotenv import load_dotenv
from pyobjtojson import obj_to_json

from browser_use import Agent
from browser_use.llm import ChatOpenAI

# Load environment variables (for API keys)
load_dotenv()


def send_agent_history_step(data):
	"""Send the agent step data to the recording API and return its JSON reply."""
	url = "http://127.0.0.1:9000/post_agent_history_step"
	response = requests.post(url, json=data)
	return response.json()


async def record_activity(agent_obj):
	"""Hook function that captures and records agent activity at each step"""
	website_html = None
	website_screenshot = None
	urls_json_last_elem = None
	model_thoughts_last_elem = None
	model_outputs_json_last_elem = None
	model_actions_json_last_elem = None
	extracted_content_json_last_elem = None

	print("--- ON_STEP_START HOOK ---")

	# Capture current page state
	website_html = await agent_obj.browser_session.get_page_html()
	website_screenshot = await agent_obj.browser_session.take_screenshot()

	# Make sure we have state history before reading from it
	if hasattr(agent_obj, "state"):
		history = agent_obj.state.history
	else:
		print("Warning: Agent has no state history")
		return

	# Process model thoughts
	model_thoughts = obj_to_json(obj=history.model_thoughts(), check_circular=False)
	if len(model_thoughts) > 0:
		model_thoughts_last_elem = model_thoughts[-1]

	# Process model outputs
	model_outputs = agent_obj.state.history.model_outputs()
	model_outputs_json = obj_to_json(obj=model_outputs, check_circular=False)
	if len(model_outputs_json) > 0:
		model_outputs_json_last_elem = model_outputs_json[-1]

	# Process model actions
	model_actions = agent_obj.state.history.model_actions()
	model_actions_json = obj_to_json(obj=model_actions, check_circular=False)
	if len(model_actions_json) > 0:
		model_actions_json_last_elem = model_actions_json[-1]

	# Process extracted content
	extracted_content = agent_obj.state.history.extracted_content()
	extracted_content_json = obj_to_json(obj=extracted_content, check_circular=False)
	if len(extracted_content_json) > 0:
		extracted_content_json_last_elem = extracted_content_json[-1]

	# Process URLs
	urls = agent_obj.state.history.urls()
	urls_json = obj_to_json(obj=urls, check_circular=False)
	if len(urls_json) > 0:
		urls_json_last_elem = urls_json[-1]

	# Create a summary of all data for this step
	model_step_summary = {
		"website_html": website_html,
		"website_screenshot": website_screenshot,
		"url": urls_json_last_elem,
		"model_thoughts": model_thoughts_last_elem,
		"model_outputs": model_outputs_json_last_elem,
		"model_actions": model_actions_json_last_elem,
		"extracted_content": extracted_content_json_last_elem,
	}

	print("--- MODEL STEP SUMMARY ---")
	print(f"URL: {urls_json_last_elem}")

	# Send data to the API
	result = send_agent_history_step(data=model_step_summary)
	print(f"Recording API response: {result}")


async def run_agent():
	"""Run the Browser-Use agent with the recording hook"""
	agent = Agent(
		task="Compare the price of gpt-4o and DeepSeek-V3",
		llm=ChatOpenAI(model="gpt-4.1-mini"),
	)
	try:
		print("Starting Browser-Use agent with recording hook")
		await agent.run(on_step_start=record_activity, max_steps=30)
	except Exception as e:
		print(f"Error running agent: {e}")


if __name__ == "__main__":
	# Check if API is running; a bare `except:` here would also swallow
	# KeyboardInterrupt/SystemExit, so catch only network-related errors
	try:
		requests.get("http://127.0.0.1:9000")
		print("Recording API is available")
	except requests.exceptions.RequestException:
		print("Warning: Recording API may not be running. Start api.py first.")

	# Run the agent
	asyncio.run(run_agent())
```
Contribution by Carlos A. Planchón.
### Working with the Recorded Data
After running the agent, you'll find the recorded data in the `recordings` directory. Here's how you can use this data:
1. **View recorded sessions**: Each JSON file contains a snapshot of agent activity for one step
2. **Extract screenshots**: You can modify the API to save screenshots separately
3. **Analyze agent behavior**: Use the recorded data to study how the agent navigates websites
### Extending the Example
You can extend this recording system in several ways:
1. **Save screenshots separately**: Uncomment the screenshot saving code in the API
2. **Add a web dashboard**: Create a simple web interface to view recorded sessions
3. **Add session IDs**: Modify the API to group steps by agent session
4. **Add filtering**: Implement filters to record only specific types of actions

View File

@@ -1,252 +0,0 @@
---
title: "MCP Client"
description: "Connect external MCP servers to extend browser-use with additional tools and integrations"
icon: "plug"
mode: "wide"
---
The MCP (Model Context Protocol) client allows browser-use agents to connect to external MCP servers, automatically exposing their tools as actions.
<Note>
MCP is an open protocol for integrating LLMs with external data sources and tools. Learn more at [modelcontextprotocol.io](https://modelcontextprotocol.io).
</Note>
<Info>
Looking to expose browser-use as an MCP server instead? See [MCP Server](/customize/mcp-server).
</Info>
## Installation
```bash
uv pip install "browser-use[cli]"
```
## Quick Start
```python
import os
from browser_use import Agent, Tools
from browser_use.mcp.client import MCPClient
# Create tools
tools = Tools()
# Connect to MCP server
mcp_client = MCPClient(
server_name="filesystem",
command="npx",
args=["@modelcontextprotocol/server-filesystem", "/path/to/files"]
)
# Connect and register
await mcp_client.connect()
await mcp_client.register_to_tools(tools)
# Agent can now use filesystem tools
agent = Agent(
task="Read the README.md file",
tools=tools
)
await agent.run()
# Clean up
await mcp_client.disconnect()
```
## API Reference
### MCPClient
```python
class MCPClient:
def __init__(
self,
server_name: str,
command: str,
args: list[str] | None = None,
env: dict[str, str] | None = None,
) -> None
```
**Parameters:**
- `server_name`: Name of the MCP server (for logging)
- `command`: Command to start the server (e.g., `"npx"`)
- `args`: Arguments for the command
- `env`: Environment variables for the server
**Key Methods:**
```python
# Connect to server
await mcp_client.connect()
# Register the server's MCP tools into the Tools registry
await mcp_client.register_to_tools(
tools,
tool_filter=['read_file', 'write_file'], # Optional
prefix='fs_' # Optional prefix
)
# Disconnect
await mcp_client.disconnect()
```
### Context Manager Usage
```python
async with MCPClient(
server_name="github",
command="npx",
args=["@modelcontextprotocol/server-github"],
env={"GITHUB_TOKEN": os.getenv("GITHUB_TOKEN")}
) as client:
await client.register_to_tools(tools)
await agent.run()
# Automatically disconnected
```
## Common MCP Servers
### Filesystem
```python
MCPClient(
server_name="filesystem",
command="npx",
args=["@modelcontextprotocol/server-filesystem", "/path"]
)
```
### PostgreSQL
```python
MCPClient(
server_name="postgres",
command="npx",
args=["@modelcontextprotocol/server-postgres", "postgresql://localhost/db"]
)
```
### GitHub
```python
MCPClient(
server_name="github",
command="npx",
args=["@modelcontextprotocol/server-github"],
env={"GITHUB_TOKEN": os.getenv("GITHUB_TOKEN")}
)
```
## Multiple Servers
Connect multiple servers with prefixes to avoid conflicts:
```python
# Filesystem server
fs_client = MCPClient(
server_name="filesystem",
command="npx",
args=["@modelcontextprotocol/server-filesystem", "."]
)
await fs_client.connect()
await fs_client.register_to_tools(tools, prefix="fs_")
# GitHub server
gh_client = MCPClient(
server_name="github",
command="npx",
args=["@modelcontextprotocol/server-github"],
env={"GITHUB_TOKEN": os.getenv("GITHUB_TOKEN")}
)
await gh_client.connect()
await gh_client.register_to_tools(tools, prefix="gh_")
# Agent can use both
agent = Agent(
task="Read README.md and create a GitHub issue",
tools=tools
)
await agent.run()
# Clean up
await fs_client.disconnect()
await gh_client.disconnect()
```
## Tool Filtering
Register only specific tools:
```python
await mcp_client.register_to_tools(
tools,
tool_filter=['read_file', 'list_directory']
)
```
## Custom MCP Server
Create your own MCP server:
```python
# my_server.py
import mcp.server.stdio
import mcp.types as types
from mcp.server import Server

server = Server("custom-tools")


@server.list_tools()
async def handle_list_tools() -> list[types.Tool]:
	"""Advertise the tools this server provides."""
	return [
		types.Tool(
			name="calculate",
			description="Perform calculation",
			inputSchema={
				"type": "object",
				"properties": {
					"expression": {"type": "string"}
				},
				"required": ["expression"]
			}
		)
	]


@server.call_tool()
async def handle_call_tool(name: str, arguments: dict) -> list[types.TextContent]:
	"""Dispatch a tool call by name and return its text result."""
	if name == "calculate":
		# SECURITY: eval() executes arbitrary Python code. Never use it on
		# untrusted input in production — prefer ast.literal_eval or a real
		# expression parser.
		result = eval(arguments["expression"])
		return [types.TextContent(type="text", text=str(result))]
	return []


# Run server
async def main():
	async with mcp.server.stdio.stdio_server() as (read, write):
		await server.run(read, write, ...)


if __name__ == "__main__":
	import asyncio

	asyncio.run(main())
```
Connect custom server:
```python
custom_client = MCPClient(
server_name="custom",
command="python",
args=["my_server.py"]
)
```
## Best Practices
1. **Always disconnect** when done
2. **Use prefixes** when connecting multiple servers
3. **Filter tools** to limit capabilities
4. **Use context managers** for automatic cleanup
## See Also
- [MCP Server](/customize/mcp-server) - Expose browser-use as an MCP server
- [Custom Functions](/customize/custom-functions) - Write custom actions directly
- [Model Context Protocol](https://modelcontextprotocol.io) - MCP specification

View File

@@ -1,436 +0,0 @@
---
title: "MCP Server"
description: "Expose browser-use capabilities as an MCP server for AI assistants like Claude Desktop"
icon: "server"
mode: "wide"
---
The MCP server exposes browser-use's browser automation capabilities as tools that can be used by AI assistants like Claude Desktop. This allows external MCP clients to control browsers, navigate websites, extract content, and perform automated tasks.
<Note>
This is the opposite of the [MCP Client](/customize/mcp-client). The MCP client lets browser-use connect to external MCP servers, while this MCP server lets external AI assistants connect to browser-use.
</Note>
## Overview
The MCP server acts as a bridge between MCP-compatible AI assistants and browser-use:
```mermaid
graph LR
A[Claude Desktop] -->|MCP Protocol| B[Browser-use MCP Server]
B --> C[Browser]
B --> D[Tools]
B --> E[FileSystem]
C --> F[Playwright Browser]
style B fill:#f9f,stroke:#333,stroke-width:2px
```
## Installation
```bash
uv pip install "browser-use[cli]"
```
## Quick Start
### 1. Configure Claude Desktop
Add browser-use to your Claude Desktop configuration:
<Tabs>
<Tab title="macOS">
Edit `~/Library/Application Support/Claude/claude_desktop_config.json`:
```json
{
"mcpServers": {
"browser-use": {
"command": "uvx",
"args": ["browser-use[cli]", "--mcp"],
"env": {
"OPENAI_API_KEY": "sk-..." // Optional: for content extraction
}
}
}
}
```
</Tab>
<Tab title="Windows">
Edit `%APPDATA%\Claude\claude_desktop_config.json`:
```json
{
"mcpServers": {
"browser-use": {
"command": "uvx",
"args": ["browser-use[cli]", "--mcp"],
"env": {
"OPENAI_API_KEY": "sk-..." // Optional: for content extraction
}
}
}
}
```
</Tab>
</Tabs>
### 2. Restart Claude Desktop
The browser-use tools will appear in Claude's tools menu (🔌 icon).
### 3. Use Browser Automation
Ask Claude to perform browser tasks:
- "Navigate to example.com and describe what you see"
- "Search for 'browser automation' on Google"
- "Fill out the contact form on this website"
## API Reference
### Available Tools
The MCP server exposes the following tools to MCP clients:
#### Navigation Tools
##### `browser_navigate`
Navigate to a URL.
```typescript
browser_navigate(url: string, new_tab?: boolean): string
```
**Parameters:**
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `url` | `string` | Yes | URL to navigate to |
| `new_tab` | `boolean` | No | Open in new tab (default: false) |
**Returns:** Success message with URL
##### `browser_go_back`
Navigate back in browser history.
```typescript
browser_go_back(): string
```
**Returns:** "Navigated back"
#### Interaction Tools
##### `browser_click`
Click an element by index.
```typescript
browser_click(index: number, new_tab?: boolean): string
```
**Parameters:**
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `index` | `number` | Yes | Element index from browser state |
| `new_tab` | `boolean` | No | Open link in new tab (default: false) |
**Returns:** Success message indicating click action
**Note:** When `new_tab` is true:
- For links: Extracts href and opens in new tab
- For other elements: Uses Cmd/Ctrl+Click
##### `browser_type`
Type text into an input field.
```typescript
browser_type(index: number, text: string): string
```
**Parameters:**
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `index` | `number` | Yes | Element index from browser state |
| `text` | `string` | Yes | Text to type |
**Returns:** Success message with typed text
##### `browser_scroll`
Scroll the page.
```typescript
browser_scroll(direction?: "up" | "down"): string
```
**Parameters:**
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `direction` | `"up" \| "down"` | No | Scroll direction (default: "down") |
**Returns:** "Scrolled {direction}"
#### State & Content Tools
##### `browser_get_state`
Get current browser state with all interactive elements.
```typescript
browser_get_state(include_screenshot?: boolean): string
```
**Parameters:**
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `include_screenshot` | `boolean` | No | Include base64 screenshot (default: false) |
**Returns:** JSON string containing:
```json
{
"url": "current page URL",
"title": "page title",
"tabs": [{"url": "...", "title": "..."}],
"interactive_elements": [
{
"index": 0,
"tag": "button",
"text": "element text (max 100 chars)",
"placeholder": "if present",
"href": "if link"
}
],
"screenshot": "base64 if requested"
}
```
The interactive elements include all clickable and interactive elements on the page, with their:
- `index`: Used to reference the element in other commands (click, type)
- `tag`: HTML tag name (button, input, a, etc.)
- `text`: Visible text content, truncated to 100 characters
- `placeholder`: For input fields (if present)
- `href`: For links (if present)
##### `browser_extract_content`
Extract structured content from the current page using AI.
```typescript
browser_extract_content(query: string, extract_links?: boolean): string
```
**Parameters:**
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `query` | `string` | Yes | What to extract (e.g., "all product prices") |
| `extract_links` | `boolean` | No | Include links in extraction (default: false) |
**Returns:** Extracted content based on query
**Note:** Requires `OPENAI_API_KEY` environment variable for AI extraction.
#### Tab Management Tools
##### `browser_list_tabs`
List all open browser tabs.
```typescript
browser_list_tabs(): string
```
**Returns:** JSON array of tab information:
```json
[
{
"tab_id": 'AE21',
"url": "https://example.com",
"title": "Page Title"
}
]
```
##### `browser_switch_tab`
Switch to a specific tab.
```typescript
browser_switch_tab(tab_id: string): string
```
**Parameters:**
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `tab_id` | `string` | Yes | ID of tab to switch to (last 4 characters of TargetID) |
**Returns:** Success message with tab URL
##### `browser_close_tab`
Close a specific tab.
```typescript
browser_close_tab(tab_id: string): string
```
**Parameters:**
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `tab_id` | `string` | Yes | ID of the Tab to close (last 4 characters of TargetID) |
**Returns:** Success message with closed tab URL
### Tool Response Format
All tools return text content. Errors are returned as strings starting with "Error:".
## Configuration
### Environment Variables
Configure the MCP server behavior through environment variables in Claude Desktop config:
```json
{
"mcpServers": {
"browser-use": {
"command": "python",
"args": ["-m", "browser_use.mcp.server"],
"env": {
"OPENAI_API_KEY": "sk-..." // For AI content extraction
}
}
}
}
```
### Browser Profile Settings
The MCP server creates a browser session with these default settings:
- **Downloads Path**: `~/Downloads/browser-use-mcp/`
- **Wait Between Actions**: 0.5 seconds
- **Keep Alive**: True (browser stays open between commands)
- **Allowed Domains**: None by default (all domains allowed)
## Advanced Usage
### Running Standalone
Test the MCP server without Claude Desktop:
```bash
# Run server (reads from stdin, writes to stdout)
uvx 'browser-use[cli]' --mcp
# The server communicates via JSON-RPC on stdio
```
### Security Considerations
<Warning>
The MCP server provides full browser control to connected AI assistants. Consider these security measures:
</Warning>
1. **Domain Restrictions**: Currently not configurable via environment variables, but the server creates sessions with no domain restrictions by default
2. **File System Access**: The server creates a FileSystem instance at `~/.browser-use-mcp` for extraction operations
3. **Downloads**: Files download to `~/Downloads/browser-use-mcp/`
## Implementation Details
### Browser Session Management
- **Lazy Initialization**: Browser session is created on first browser tool use
- **Persistent Session**: Session remains active across multiple tool calls
- **Single Session**: Currently maintains one browser session per server instance
### Tool Categories
1. **Direct Browser Control**: Tools starting with `browser_` that directly interact with the browser
2. **Agent Tasks**: Currently commented out in implementation (`browser_use_run_task`)
### Error Handling
- All exceptions are caught and returned as text: `"Error: {message}"`
- Browser session initialization errors are returned to the client
- Missing dependencies (e.g., OPENAI_API_KEY) return descriptive error messages
## Troubleshooting
### Server Not Appearing in Claude
1. **Check configuration path:**
- macOS: `~/Library/Application Support/Claude/claude_desktop_config.json`
- Windows: `%APPDATA%\Claude\claude_desktop_config.json`
2. **Verify Python installation:**
```bash
uvx 'browser-use[cli]' --version
uvx 'browser-use[cli]' --mcp --help
```
3. **Check Claude logs:**
- macOS: `~/Library/Logs/Claude/mcp.log`
- Windows: `%APPDATA%\Claude\logs\mcp.log`
### Browser Not Launching
```bash
# Install Playwright browsers
playwright install chromium
# Test browser launch
python -c "from browser_use import Browser; import asyncio; asyncio.run(Browser().start())"
```
### Connection Errors
If you see "MCP server connection failed":
1. Test the server directly:
```bash
uvx 'browser-use[cli]' --mcp
```
2. Check all dependencies:
```bash
uv pip install "browser-use[cli]"
```
### Content Extraction Not Working
If `browser_extract_content` returns errors:
1. Ensure `OPENAI_API_KEY` is set in the environment configuration
2. Verify the API key is valid
3. Check that you have credits/access to the OpenAI API
## Limitations
| Limitation | Description | Workaround |
|------------|-------------|------------|
| Single Browser Session | One browser instance per server | Restart server for new session |
| No Domain Restrictions Config | Cannot configure allowed domains via env vars | Modify server code if needed |
| No Agent Mode | `browser_use_run_task` is commented out | Use direct browser control tools |
| Text-Only Responses | All responses are text strings | Parse JSON responses client-side |
## Comparison with MCP Client
| Feature | MCP Server (this) | [MCP Client](/customize/mcp-client) |
|---------|-------------------|-------------------------------------|
| **Purpose** | Expose browser to AI | Connect agent to tools |
| **User** | Claude Desktop, etc. | Browser-use agents |
| **Direction** | External → Browser | Agent → External |
| **Configuration** | JSON config file | Python code |
| **Tools** | Fixed browser tools | Dynamic from server |
| **Use Case** | Interactive assistance | Automated workflows |
## Code Examples
- [Simple MCP client example](https://github.com/browser-use/browser-use/tree/main/examples/mcp/simple_server.py) - Basic MCP client connecting to browser-use server
- [Advanced MCP client example](https://github.com/browser-use/browser-use/tree/main/examples/mcp/advanced_server.py) - Multi-server orchestration and complex workflows
## See Also
- [MCP Client](/customize/mcp-client) - Connect browser-use to external MCP servers
- [Model Context Protocol](https://modelcontextprotocol.io) - MCP specification
- [Claude Desktop](https://claude.ai/download) - Primary MCP client

View File

@@ -1,72 +0,0 @@
---
title: "Contribution Guide"
description: "Learn how to contribute to Browser Use"
icon: "github"
mode: "wide"
---
# Join the Browser Use Community!
We're thrilled you're interested in contributing to Browser Use! This guide will help you get started with contributing to our project. Your contributions are what make the open-source community such an amazing place to learn, inspire, and create.
## Quick Setup
Get started with Browser Use development in minutes:
```bash
git clone https://github.com/browser-use/browser-use
cd browser-use
uv sync --all-extras --dev
# or pip install -U git+https://github.com/browser-use/browser-use.git@main
echo "BROWSER_USE_LOGGING_LEVEL=debug" >> .env
```
For more detailed setup instructions, see our [Local Setup Guide](/development/local-setup).
## How to Contribute
### Find Something to Work On
- Browse our [GitHub Issues](https://github.com/browser-use/browser-use/issues) for beginner-friendly issues labeled `good-first-issue`
- Check out our most active issues or ask in [Discord](https://discord.gg/zXJJHtJf3k) for ideas of what to work on
- Get inspiration and share what you build in the [`#showcase-your-work`](https://discord.com/channels/1303749220842340412/1305549200678850642) channel
- Explore or contribute to [`awesome-browser-use-prompts`](https://github.com/browser-use/awesome-prompts)!
### Making a Great Pull Request
When submitting a pull request, please:
- Include a clear description of what the PR does and why it's needed
- Add tests that cover your changes
- Include a demo screenshot/gif or an example script demonstrating your changes
- Make sure the PR passes all CI checks and tests
- Keep your PR focused on a single issue or feature to make it easier to review
Note: We appreciate quality over quantity. Instead of submitting small typo/style-only PRs, consider including those fixes as part of larger bugfix or feature PRs.
### Contribution Process
1. Fork the repository
2. Create a new branch for your feature or bugfix
3. Make your changes
4. Run tests to ensure everything works
5. Submit a pull request
6. Respond to any feedback from maintainers
7. Celebrate your contribution!
Feel free to bump your issues/PRs with comments periodically if you need faster feedback.
## Code of Conduct
We're committed to providing a welcoming and inclusive environment for all contributors. Please be respectful and constructive in all interactions.
## Getting Help
If you need help at any point:
- Join our [Discord community](https://link.browser-use.com/discord)
- Ask questions in the appropriate GitHub issue
- Check our [documentation](/introduction)
We're here to help you succeed in contributing to Browser Use!

View File

@@ -0,0 +1,11 @@
---
title: "Get Help"
description: "More than 20k developers help each other"
icon: "circle-question"
mode: "wide"
---
1. Check our [GitHub Issues](https://github.com/browser-use/browser-use/issues)
2. Ask in our [Discord community](https://link.browser-use.com/discord)
3. Get support for your enterprise with support@browser-use.com

View File

@@ -1,160 +0,0 @@
---
title: "Local Setup"
description: "Set up Browser Use development environment locally"
icon: "laptop-code"
mode: "wide"
---
# Welcome to Browser Use Development!
We're excited to have you join our community of contributors. This guide will help you set up your local development environment quickly and easily.
## Quick Setup
If you're familiar with Python development, here's the quick way to get started:
```bash
git clone https://github.com/browser-use/browser-use
cd browser-use
uv sync --all-extras --dev
# or pip install -U git+https://github.com/browser-use/browser-use.git@main
echo "BROWSER_USE_LOGGING_LEVEL=debug" >> .env
```
## Helper Scripts
We provide several convenient shell scripts in the `bin/` directory to help with common development tasks:
```bash
# Complete setup script - installs uv, creates a venv, and installs dependencies
./bin/setup.sh
# Run all pre-commit hooks (formatting, linting, type checking)
./bin/lint.sh
# Run the core test suite that's executed in CI
./bin/test.sh
```
## Prerequisites
Browser Use requires Python 3.11 or higher. We recommend using [uv](https://docs.astral.sh/uv/) for Python environment management.
## Detailed Setup Instructions
### Clone the Repository
First, clone the Browser Use repository:
```bash
git clone https://github.com/browser-use/browser-use
cd browser-use
```
### Environment Setup
1. Create and activate a virtual environment:
```bash
uv venv --python 3.11
source .venv/bin/activate
```
2. Install dependencies:
```bash
# Install the package in editable mode with all development dependencies
uv sync --all-extras
# Install the default browser
playwright install chromium --with-deps --no-shell
```
## Configuration
Set up your environment variables:
```bash
# Copy the example environment file
cp .env.example .env
```
Or manually create a `.env` file with the API key for the models you want to use set:
```bash .env
OPENAI_API_KEY=...
ANTHROPIC_API_KEY=
AZURE_ENDPOINT=
AZURE_OPENAI_API_KEY=
GOOGLE_API_KEY=
DEEPSEEK_API_KEY=
GROK_API_KEY=
NOVITA_API_KEY=
BROWSER_USE_LOGGING_LEVEL=debug # Helpful for development
```
<Note>
See [Supported Models](/customize/supported-models) for available LLM options
and their specific API key requirements.
</Note>
## Development
After setup, you can:
- Try demos in the example library with `uv run examples/simple.py`
- Run the linter/formatter with `uv run ruff format examples/some/file.py`
- Run tests with `uv run pytest`
- Build the package with `uv build`
### Linting
```bash
# Run the linter on the whole project (must pass for PR to be allowed to merge)
uv run pre-commit run --all-files
# or use our convenience script
./bin/lint.sh
# Install the linter & formatter pre-commit hooks to run automatically
pre-commit install --install-hooks
# Experimental: run the type checker
uv run type
```
### Tests
```bash
# Run all tests that run in CI
./bin/test.sh
# Run specific tests
uv run pytest # run everything
uv run pytest tests/test_tools.py # run a specific test file
uv run pytest tests/test_sensitive_data.py tests/test_tab_management.py # run two test files
uv run pytest tests/test_tab_management.py::TestTabManagement::test_user_changes_tab # run a single test
```
### Build
```bash
uv build
uv pip install dist/*.whl
# push build to PyPI (automatically run by Github Actions CI)
uv publish
```
## Getting Help
If you run into any issues:
1. Check our [GitHub Issues](https://github.com/browser-use/browser-use/issues)
2. Join our [Discord community](https://link.browser-use.com/discord) for support
<Note>
We welcome contributions! See our [Contribution
Guide](/development/contribution-guide) for guidelines on how to help improve
Browser Use.
</Note>

View File

@@ -31,7 +31,7 @@ import asyncio
from lmnr import Laminar, Instruments
# this line auto-instruments Browser Use and any browser you use (local or remote)
Laminar.initialize(project_api_key="...")
Laminar.initialize(project_api_key="...", disabled_instruments={Instruments.BROWSER_USE})
async def main():
agent = Agent(

View File

@@ -0,0 +1,31 @@
---
title: "Telemetry"
description: "Understanding Browser Use's telemetry"
icon: "chart-mixed"
mode: "wide"
---
## Overview
Browser Use is free under the MIT license. To help us continue improving the library, we collect anonymous usage data with [PostHog](https://posthog.com). This information helps us understand how the library is used, fix bugs more quickly, and prioritize new features.
## Opting Out
You can disable telemetry by setting the environment variable:
```bash .env
ANONYMIZED_TELEMETRY=false
```
Or in your Python code:
```python
import os
os.environ["ANONYMIZED_TELEMETRY"] = "false"
```
<Note>
Even when enabled, telemetry has zero impact on the library's performance. Code is available in [Telemetry
Service](https://github.com/browser-use/browser-use/tree/main/browser_use/telemetry).
</Note>

View File

@@ -0,0 +1,37 @@
---
title: "Contribution Guide"
description: ""
icon: "handshake"
mode: "wide"
---
## Mission
- Make developers happy
- Do more clicks than a human
- Tell your computer what to do, and it gets it done.
- Make agents faster and more reliable.
## What to work on?
- This space is moving fast. We have 10 ideas daily. Let's exchange some.
- Browse our [GitHub Issues](https://github.com/browser-use/browser-use/issues)
- Check out our most active issues on [Discord](https://discord.gg/zXJJHtJf3k)
- Get inspiration in [`#showcase-your-work`](https://discord.com/channels/1303749220842340412/1305549200678850642) channel
## What makes a great PR?
1. Why do we need this PR?
2. Include a demo screenshot/gif
3. Make sure the PR passes all CI tests
4. Keep your PR focused on a single feature
## How?
1. Fork the repository
2. Create a new branch for your feature
3. Submit a PR
We are overwhelmed with Issues. Feel free to bump your issues/PRs with comments periodically if you need faster feedback.

View File

@@ -0,0 +1,49 @@
---
title: "Local Setup"
description: "We're excited to have you join our community of contributors. "
icon: "laptop-code"
mode: "wide"
---
## Welcome to Browser Use Development!
```bash
git clone https://github.com/browser-use/browser-use
cd browser-use
uv sync --all-extras --dev
# or pip install -U git+https://github.com/browser-use/browser-use.git@main
```
## Configuration
Set up your environment variables:
```bash
# Copy the example environment file
cp .env.example .env
# set logging level
# BROWSER_USE_LOGGING_LEVEL=debug
```
## Helper Scripts
For common development tasks
```bash
# Complete setup script - installs uv, creates a venv, and installs dependencies
./bin/setup.sh
# Run all pre-commit hooks (formatting, linting, type checking)
./bin/lint.sh
# Run the core test suite that's executed in CI
./bin/test.sh
```
## Run examples
```bash
uv run examples/simple.py
```

View File

@@ -1,40 +0,0 @@
---
title: "Telemetry"
description: "Understanding Browser Use's telemetry and privacy settings"
icon: "chart-mixed"
mode: "wide"
---
## Overview
Browser Use collects anonymous usage data to help us understand how the library is being used and to improve the user experience. It also helps us fix bugs faster and prioritize feature development.
## Data Collection
We use [PostHog](https://posthog.com) for telemetry collection. The data is completely anonymized and contains no personally identifiable information.
<Note>
We never collect personal information, credentials, or specific content from
your browser automation tasks.
</Note>
## Opting Out
You can disable telemetry by setting an environment variable:
```bash .env
ANONYMIZED_TELEMETRY=false
```
Or in your Python code:
```python
import os
os.environ["ANONYMIZED_TELEMETRY"] = "false"
```
<Note>
Even when enabled, telemetry has zero impact on the library's performance or
functionality. Code is available in [Telemetry
Service](https://github.com/browser-use/browser-use/tree/main/browser_use/telemetry).
</Note>

View File

@@ -9,7 +9,10 @@
},
"favicon": "/favicon.ico",
"contextual": {
"options": ["copy", "view"]
"options": [
"copy",
"view"
]
},
"fonts": {
"family": "Geist"
@@ -42,11 +45,31 @@
},
{
"source": "/development/evaluations",
"destination": "/development/contribution-guide"
"destination": "/development/setup/contribution-guide"
},
{
"source": "/cli",
"destination": "/quickstart"
},
{
"source": "/development/local-setup",
"destination": "/development/setup/local-setup"
},
{
"source": "/development/contribution-guide",
"destination": "/development/setup/contribution-guide"
},
{
"source": "/development/telemetry",
"destination": "/development/monitoring/telemetry"
},
{
"source": "/development/observability",
"destination": "/development/monitoring/observability"
},
{
"source": "/development/hooks",
"destination": "/customize/hooks"
}
],
"navigation": {
@@ -56,7 +79,11 @@
"groups": [
{
"group": "Get Started",
"pages": ["introduction", "quickstart", "quickstart_llm"]
"pages": [
"introduction",
"quickstart",
"quickstart_llm"
]
},
{
"group": "Customize",
@@ -104,7 +131,8 @@
"customize/examples/parallel-browser",
"customize/examples/sensitive-data",
"customize/examples/secure",
"customize/examples/more-examples"
"customize/examples/more-examples",
"customize/examples/prompting-guide"
]
}
]
@@ -112,22 +140,40 @@
{
"group": "Development",
"pages": [
"development/contribution-guide",
"development/local-setup",
{
"group": "MCP",
"icon": "link",
"pages": ["customize/mcp-client", "customize/mcp-server"]
"group": "Contribution",
"icon": "github",
"isDefaultOpen": true,
"pages": [
"development/setup/local-setup",
"development/setup/contribution-guide"
]
},
"customize/hooks",
"development/telemetry",
"development/observability"
{
"group": "Advanced",
"icon": "gear",
"isDefaultOpen": false,
"pages": [
"customize/hooks"
]
},
{
"group": "Monitoring",
"icon": "chart-mixed",
"isDefaultOpen": false,
"pages": [
"development/monitoring/observability",
"development/monitoring/telemetry"
]
},
"development/get-help"
]
}
]
},
{
"tab": "Cloud",
"hidden": true,
"versions": [
{
"version": "v1",
@@ -155,27 +201,6 @@
"openapi": "https://api.browser-use.com/api/v1/openapi.json"
}
]
},
{
"version": "v2",
"groups": [
{
"group": "Get Started",
"pages": [
"cloud/v2/quickstart",
"cloud/v2/python-quickstart",
"cloud/v2/node-quickstart"
]
},
{
"group": "Platform",
"pages": [
"cloud/v1/pricing",
"cloud/v1/n8n-browser-use-integration",
"cloud/v1/search"
]
}
]
}
]
}
@@ -191,7 +216,11 @@
"display": "interactive"
},
"examples": {
"languages": ["javascript", "curl", "python"],
"languages": [
"javascript",
"curl",
"python"
],
"required": true
}
},

View File

@@ -20,9 +20,9 @@ icon: "book-open"
Open-source Python library.
</Card>
<Card
title="Cloud API"
title="Cloud Setup"
icon="cloud"
href="/cloud/v2/quickstart"
href="https://docs.cloud.browser-use.com"
color="#FE750E"
>
Scale up with our cloud.

View File

@@ -9,13 +9,13 @@ icon: "rocket"
<Tabs>
<Tab title="uv">
```bash create environment
```bash create environment
uv venv --python 3.12
```
</Tab>
<Tab title="pip">
```bash create environment
python -m venv .venv
```bash create environment with python >= 3.11
python3.12 -m venv .venv
```
</Tab>
</Tabs>
@@ -43,7 +43,7 @@ uvx playwright install chromium --with-deps
<Tab title="pip">
```bash install browser-use & chromium
pip install browser-use
playwright install chromium --with-deps
pip install playwright && playwright install chromium --with-deps
```
</Tab>
</Tabs>

View File

@@ -6,5 +6,5 @@ icon: "brain"
1. Copy all content [🔗 from here](https://docs.browser-use.com/llms-full.txt) (~40k tokens)
1. Copy all content [🔗 from here](https://docs.browser-use.com/llms-full.txt) (~32k tokens)
2. Paste it into your favorite coding agent (Cursor, Claude, ChatGPT ...).

View File

@@ -1,5 +1,3 @@
import asyncio
import logging
import os
import sys
@@ -9,58 +7,26 @@ from dotenv import load_dotenv
load_dotenv()
import pyotp # type: ignore
from browser_use import ActionResult, Agent, ChatOpenAI, Tools
from browser_use import Agent
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
secret_key = os.environ.get('OTP_SECRET_KEY')
if not secret_key:
# For this example copy the code from the website https://authenticationtest.com/totpChallenge/
# For real 2fa just copy the secret key when you setup 2fa, you can get this e.g. in 1Password
secret_key = 'JBSWY3DPEHPK3PXP'
tools = Tools()
sensitive_data = {'bu_2fa_code': secret_key}
@tools.registry.action('Get 2FA code from when OTP is required')
async def get_otp_2fa() -> ActionResult:
"""
Custom action to retrieve 2FA/MFA code from OTP secret key using pyotp.
The OTP secret key should be set in the environment variable OTP_SECRET_KEY.
"""
secret_key = os.environ.get('OTP_SECRET_KEY')
if not secret_key:
raise ValueError('OTP_SECRET_KEY environment variable is not set')
task = """
1. Go to https://authenticationtest.com/totpChallenge/ and try to log in.
2. If prompted for 2FA code:
Input the the secret bu_2fa_code.
totp = pyotp.TOTP(secret_key, digits=6)
code = totp.now()
return ActionResult(extracted_content=code)
When you input bu_2fa_code, the 6 digit code will be generated automatically.
"""
async def main():
# Example task using the 1Password 2FA action
task = """
Steps:
1. Go to https://authenticationtest.com/totpChallenge/ and try to log in.
2. If prompted for 2FA code:
2.1. Use the get_2fa_code action to retrieve the 2FA code.
2.2. Submit the code provided by the get_2fa_code action.
Considerations:
- ALWAYS use the get_2fa_code action to retrieve the 2FA code if needed.
- NEVER skip the 2FA step if the page requires it.
- NEVER extract the code from the page.
- NEVER use a code that is not generated by the get_2fa_code action.
- NEVER hallucinate the 2FA code, always use the get_2fa_code action to get it.
You are completely FORBIDDEN to use any other method to get the 2FA code.
"""
model = ChatOpenAI(model='gpt-4.1-mini')
agent = Agent(task=task, llm=model, tools=tools)
result = await agent.run()
print(f'Task completed with result: {result}')
if __name__ == '__main__':
asyncio.run(main())
Agent(task=task, sensitive_data=sensitive_data).run_sync() # type: ignore

View File

@@ -28,13 +28,6 @@ from browser_use import Agent, ChatOpenAI, Tools
from browser_use.agent.views import ActionResult
from browser_use.browser import BrowserSession
try:
from lmnr import Laminar
Laminar.initialize(project_api_key=os.getenv('LMNR_PROJECT_API_KEY'))
except ImportError:
pass
class OpenAICUAAction(BaseModel):
"""Parameters for OpenAI Computer Use Assistant action."""

View File

@@ -0,0 +1,113 @@
"""
Show how to use sample_images to add image context for your task
"""
import asyncio
import base64
from pathlib import Path
from typing import Any
from dotenv import load_dotenv
from browser_use import Agent
from browser_use.llm import ChatOpenAI
from browser_use.llm.messages import ContentPartImageParam, ContentPartTextParam, ImageURL
# Load environment variables
load_dotenv()
def image_to_base64(image_path: str) -> str:
    """Return the base64-encoded contents of an image file.

    Args:
        image_path: Path to the image file.

    Returns:
        Base64 encoded string of the image bytes.

    Raises:
        FileNotFoundError: If the image file does not exist.
        OSError: If the image file cannot be read.
    """
    source = Path(image_path)
    if not source.exists():
        raise FileNotFoundError(f'Image file not found: {image_path}')
    try:
        raw_bytes = source.read_bytes()
    except OSError as e:
        raise OSError(f'Failed to read image file: {e}')
    return base64.b64encode(raw_bytes).decode('utf-8')
def create_sample_images() -> list[ContentPartTextParam | ContentPartImageParam]:
    """Build the text + image content parts that give the agent visual context.

    Returns:
        list of content parts containing text and image data
    """
    # Image path - replace with your actual image path
    image_path = 'sample_image.png'

    # Describe what the annotated screenshot shows before attaching it.
    layout_description = (
        'The following image explains the google layout. '
        'The image highlights several buttons with red boxes, '
        'and next to them are corresponding labels in red text.\n'
        'Each label corresponds to a button as follows:\n'
        'Label 1 is the "image" button.'
    )
    encoded_image = image_to_base64(image_path)

    # Construct the content parts directly: one text part, one inline PNG.
    return [
        ContentPartTextParam(text=layout_description),
        ContentPartImageParam(
            image_url=ImageURL(
                url=f'data:image/png;base64,{encoded_image}',
                media_type='image/png',
            ),
        ),
    ]
async def main() -> None:
    """Run the browser agent with image context."""
    # Task configuration
    task_str = 'goto https://www.google.com/ and click image button'

    # Initialize the language model
    model = ChatOpenAI(model='gpt-4.1')

    # Image context is optional: continue without it if the sample file is missing.
    try:
        context_parts = create_sample_images()
    except (FileNotFoundError, OSError) as e:
        print(f'Error loading sample images: {e}')
        print('Continuing without sample images...')
        context_parts = []

    # Initialize and run the agent
    await Agent(task=task_str, llm=model, sample_images=context_parts).run()


if __name__ == '__main__':
    asyncio.run(main())

View File

@@ -1,6 +1,7 @@
import asyncio
import os
import sys
from pathlib import Path
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
@@ -16,7 +17,7 @@ browser_session = BrowserSession(
browser_profile=BrowserProfile(
keep_alive=True,
headless=False,
record_video_dir='./tmp/recordings',
record_video_dir=Path('./tmp/recordings'),
user_data_dir='~/.config/browseruse/profiles/default',
)
)

View File

@@ -69,11 +69,11 @@ browser_profile = BrowserProfile(allowed_domains=['*google.com', 'browser-use.co
# Sensitive data (optional) - {key: sensitive_information} - we filter out the sensitive_information from any input to the LLM, it will only work with placeholder.
# By default we pass screenshots to the LLM which can contain your information. Set use_vision=False to disable this.
# If you trust your LLM endpoint, you don't need to worry about this.
sensitive_data: dict[str, str | dict[str, str]] = {'company_name': 'browser-use'}
sensitive_data = {'company_name': 'browser-use'}
# Create Agent
agent = Agent(task=task, llm=llm, browser_profile=browser_profile, sensitive_data=sensitive_data)
agent = Agent(task=task, llm=llm, browser_profile=browser_profile, sensitive_data=sensitive_data) # type: ignore
async def main():

View File

@@ -25,13 +25,14 @@ company_credentials = {'company_username': 'user@example.com', 'company_password
# Map the same credentials to multiple domains for secure access control
# Type annotation to satisfy pyright
sensitive_data: dict[str, str | dict[str, str]] = {
sensitive_data = {
'https://example.com': company_credentials,
'https://admin.example.com': company_credentials,
'https://*.example-staging.com': company_credentials,
'http*://test.example.com': company_credentials,
# You can also add domain-specific credentials
'https://*.google.com': {'g_email': 'user@gmail.com', 'g_pass': 'google_password'},
# # You can also add domain-specific credentials
# 'https://google.com': {'g_email': 'user@gmail.com', 'g_pass': 'google_password'},
'this_email_works_on_all_domains': 'test@test.com',
}
# Update task to use one of the credentials above
task = 'Go to google.com and put the login information in the search bar.'

View File

@@ -0,0 +1,25 @@
import asyncio
from pathlib import Path
from browser_use import Agent, Browser, ChatOpenAI
# NOTE: To use this example, install imageio[ffmpeg], e.g. with uv pip install "browser-use[video]"
async def main():
    """Record a short agent run as a video under ./tmp/recordings."""
    # record_video_dir enables video capture; requires the imageio[ffmpeg] extra (see note above imports).
    browser_session = Browser(record_video_dir=Path('./tmp/recordings'))
    agent = Agent(
        task='Go to github.com/trending then navigate to the first trending repository and report how many commits it has.',
        llm=ChatOpenAI(model='gpt-4.1-mini'),
        browser_session=browser_session,
    )
    # Cap the run so the example (and the resulting video) stays short.
    await agent.run(max_steps=5)
    # The video will be saved automatically when the agent finishes and the session closes.
    print('Agent run finished. Check the ./tmp/recordings directory for the video.')


if __name__ == '__main__':
    asyncio.run(main())

View File

@@ -0,0 +1,42 @@
import asyncio
import os
import sys
from agentmail import AsyncAgentMail # type: ignore
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
load_dotenv()
from browser_use import Agent, Browser, models
from examples.integrations.agentmail.email_tools import EmailTools
TASK = """
Go to reddit.com, create a new account (use the get_email_address), make up password and all other information, confirm the 2fa with get_latest_email, and like latest post on r/elon subreddit.
"""
async def main():
    """Create a fresh AgentMail inbox, then run the agent with email tools attached."""
    # Create email inbox
    # Get an API key from https://agentmail.to/
    email_client = AsyncAgentMail()
    inbox = await email_client.inboxes.create()
    print(f'Your email address is: {inbox.inbox_id}\n\n')

    # Initialize the tools for browser-use agent
    tools = EmailTools(email_client=email_client, inbox=inbox)

    # Initialize the LLM for browser-use agent
    llm = models.azure_gpt_4_1_mini

    # Set your local browser path
    browser = Browser(executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome')

    agent = Agent(task=TASK, tools=tools, llm=llm, browser=browser)
    await agent.run()


if __name__ == '__main__':
    asyncio.run(main())

View File

@@ -0,0 +1,187 @@
"""
Email management to enable 2fa.
"""
import asyncio
import logging
# run `pip install agentmail` to install the library
from agentmail import AsyncAgentMail, Message, MessageReceivedEvent, Subscribe # type: ignore
from agentmail.inboxes.types.inbox import Inbox # type: ignore
from agentmail.inboxes.types.inbox_id import InboxId # type: ignore
from browser_use import Tools
# Configure basic logging if not already configured
if not logging.getLogger().handlers:
logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(name)s - %(message)s')
logger = logging.getLogger(__name__)
class EmailTools(Tools):
    """Tool registry that gives a browser-use agent access to an AgentMail inbox (e.g. to fetch 2FA codes)."""

    def __init__(
        self,
        email_client: AsyncAgentMail | None = None,
        email_timeout: int = 30,
        inbox: Inbox | None = None,
    ):
        super().__init__()
        # Use the caller-supplied client, or construct a default AsyncAgentMail client.
        self.email_client = email_client or AsyncAgentMail()
        # Seconds wait_for_message() waits for a new email event before timing out.
        self.email_timeout = email_timeout
        self.register_email_tools()
        # Inbox to operate on; created lazily by get_or_create_inbox_client() when None.
        self.inbox: Inbox | None = inbox

    def _serialize_message_for_llm(self, message: Message) -> str:
        """
        Serialize a message for the LLM
        """
        # Use text if available, otherwise convert HTML to simple text
        body_content = message.text
        if not body_content and message.html:
            body_content = self._html_to_text(message.html)
        msg = f'From: {message.from_}\nTo: {message.to}\nTimestamp: {message.timestamp.isoformat()}\nSubject: {message.subject}\nBody: {body_content}'
        return msg

    def _html_to_text(self, html: str) -> str:
        """
        Simple HTML to text conversion
        """
        import re

        # Remove script and style elements - handle spaces in closing tags
        html = re.sub(r'<script\b[^>]*>.*?</script\s*>', '', html, flags=re.DOTALL | re.IGNORECASE)
        html = re.sub(r'<style\b[^>]*>.*?</style\s*>', '', html, flags=re.DOTALL | re.IGNORECASE)
        # Remove HTML tags
        html = re.sub(r'<[^>]+>', '', html)
        # Decode HTML entities
        html = html.replace('&nbsp;', ' ')
        html = html.replace('&amp;', '&')
        html = html.replace('&lt;', '<')
        html = html.replace('&gt;', '>')
        html = html.replace('&quot;', '"')
        html = html.replace('&#39;', "'")
        # Clean up whitespace
        html = re.sub(r'\s+', ' ', html)
        html = html.strip()
        return html

    async def get_or_create_inbox_client(self) -> Inbox:
        """
        Create a default inbox profile for this API key (assume that agent is on free tier)
        If you are not on free tier it is recommended to create 1 inbox per agent.
        """
        # Reuse the cached inbox if one was supplied or already created.
        if self.inbox:
            return self.inbox
        return await self.create_inbox_client()

    async def create_inbox_client(self) -> Inbox:
        """
        Create a default inbox profile for this API key (assume that agent is on free tier)
        If you are not on free tier it is recommended to create 1 inbox per agent.
        """
        inbox = await self.email_client.inboxes.create()
        # Cache the inbox so subsequent tool calls reuse it.
        self.inbox = inbox
        return inbox

    async def wait_for_message(self, inbox_id: InboxId) -> Message:
        """
        Wait for a message to be received in the inbox

        Raises:
            TimeoutError: If no message event arrives within self.email_timeout seconds.
        """
        async with self.email_client.websockets.connect() as ws:
            # Subscribe to realtime events for this inbox only.
            await ws.send_subscribe(message=Subscribe(inbox_ids=[inbox_id]))
            try:
                while True:
                    # NOTE(review): the timeout applies per received event, so unrelated events
                    # reset the window and the total wait can exceed email_timeout — confirm intended.
                    data = await asyncio.wait_for(ws.recv(), timeout=self.email_timeout)
                    if isinstance(data, MessageReceivedEvent):
                        # Mark the message as read before returning it.
                        await self.email_client.inboxes.messages.update(
                            inbox_id=inbox_id, message_id=data.message.message_id, remove_labels=['unread']
                        )
                        msg = data.message
                        logger.info(f'Received new message from: {msg.from_} with subject: {msg.subject}')
                        return msg
                    # If not MessageReceived, continue waiting for the next event
            except TimeoutError:
                raise TimeoutError(f'No email received in the inbox in {self.email_timeout}s')

    def register_email_tools(self):
        """Register all email-related controller actions"""

        @self.action('Get email address for login. You can use this email to login to any service with email and password')
        async def get_email_address() -> str:
            """
            Get the email address of the inbox
            """
            inbox = await self.get_or_create_inbox_client()
            logger.info(f'Email address: {inbox.inbox_id}')
            # The inbox id doubles as the email address.
            return inbox.inbox_id

        @self.action(
            'Get the latest unread email from the inbox from the last max_age_minutes (default 5 minutes). Waits some seconds for new emails if none found. Use for 2FA codes.'
        )
        async def get_latest_email(max_age_minutes: int = 5) -> str:
            """
            1. Check for unread emails within the last max_age_minutes
            2. If no recent unread email, wait 30 seconds for new email via websocket
            """
            from datetime import datetime, timedelta, timezone

            inbox = await self.get_or_create_inbox_client()

            # Get unread emails
            emails = await self.email_client.inboxes.messages.list(inbox_id=inbox.inbox_id, labels=['unread'])

            # Filter unread emails by time window - use UTC timezone to match email timestamps
            time_cutoff = datetime.now(timezone.utc) - timedelta(minutes=max_age_minutes)
            logger.debug(f'Time cutoff: {time_cutoff}')
            logger.info(f'Found {len(emails.messages)} unread emails for inbox {inbox.inbox_id}')

            recent_unread_emails = []
            for i, email_summary in enumerate(emails.messages):
                # Get full email details to check timestamp
                full_email = await self.email_client.inboxes.messages.get(
                    inbox_id=inbox.inbox_id, message_id=email_summary.message_id
                )
                # Handle timezone comparison properly
                email_timestamp = full_email.timestamp
                if email_timestamp.tzinfo is None:
                    # If email timestamp is naive, assume UTC
                    email_timestamp = email_timestamp.replace(tzinfo=timezone.utc)
                if email_timestamp >= time_cutoff:
                    recent_unread_emails.append(full_email)

            # If we have recent unread emails, return the latest one
            if recent_unread_emails:
                # Sort by timestamp and get the most recent
                recent_unread_emails.sort(key=lambda x: x.timestamp, reverse=True)
                logger.info(f'Found {len(recent_unread_emails)} recent unread emails for inbox {inbox.inbox_id}')
                latest_email = recent_unread_emails[0]
                # Mark as read
                await self.email_client.inboxes.messages.update(
                    inbox_id=inbox.inbox_id, message_id=latest_email.message_id, remove_labels=['unread']
                )
                logger.info(f'Latest email from: {latest_email.from_} with subject: {latest_email.subject}')
                return self._serialize_message_for_llm(latest_email)
            else:
                logger.info('No recent unread emails, waiting for a new one')
                # No recent unread emails, wait for new one
                try:
                    latest_message = await self.wait_for_message(inbox_id=inbox.inbox_id)
                except TimeoutError:
                    return f'No email received in the inbox in {self.email_timeout}s'
                # logger.info(f'Latest message: {latest_message}')
                return self._serialize_message_for_llm(latest_message)

View File

@@ -67,12 +67,10 @@ class GmailGrantManager:
with open(self.credentials_file) as f:
creds = json.load(f)
required_fields = ['web']
web = creds['web']
if not web:
return False, "Invalid credentials format - missing 'web' section"
return True, 'Credentials file is valid'
# Accept if either 'web' or 'installed' section exists and is not empty
if creds.get('web') or creds.get('installed'):
return True, 'Credentials file is valid'
return False, "Invalid credentials format - neither 'web' nor 'installed' sections found"
except json.JSONDecodeError:
return False, 'Credentials file is not valid JSON'

View File

@@ -14,13 +14,9 @@ Requirements:
import asyncio
from lmnr import Laminar
from browser_use import Agent
from browser_use.llm import ChatAnthropicBedrock, ChatAWSBedrock
Laminar.initialize()
async def example_anthropic_bedrock():
"""Example using ChatAnthropicBedrock - convenience class for Claude models."""

View File

@@ -10,10 +10,8 @@ import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
from lmnr import Laminar
load_dotenv()
Laminar.initialize()
from browser_use import Agent
from browser_use.llm import ChatAnthropic

View File

@@ -5,15 +5,11 @@ import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from dotenv import load_dotenv
from lmnr import Laminar
load_dotenv()
Laminar.initialize()
from browser_use import Agent, ChatGoogle
load_dotenv()
api_key = os.getenv('GOOGLE_API_KEY')
if not api_key:
raise ValueError('GOOGLE_API_KEY is not set')

View File

@@ -7,15 +7,11 @@ Simple try of the agent.
import asyncio
from dotenv import load_dotenv
from lmnr import Laminar
from browser_use import Agent, ChatOpenAI
load_dotenv()
Laminar.initialize()
# All the models are type safe from OpenAI in case you need a list of supported models
llm = ChatOpenAI(model='gpt-4.1-mini')
agent = Agent(

View File

@@ -7,15 +7,11 @@ Simple try of the agent.
import asyncio
from dotenv import load_dotenv
from lmnr import Laminar
from browser_use import Agent, ChatOpenAI
load_dotenv()
Laminar.initialize()
# All the models are type safe from OpenAI in case you need a list of supported models
llm = ChatOpenAI(model='gpt-5-mini')
agent = Agent(

View File

@@ -12,13 +12,10 @@ This example demonstrates how to:
import asyncio
from langchain_openai import ChatOpenAI # pyright: ignore
from lmnr import Laminar
from browser_use import Agent
from examples.models.langchain.chat import ChatLangchain
Laminar.initialize()
async def main():
"""Basic example using ChatLangchain with OpenAI through LangChain."""

View File

@@ -0,0 +1,6 @@
# Minimal agent setup: use a preset model object instead of constructing a chat class.
from browser_use import Agent, models

# available providers for this import style: openai, azure, google
agent = Agent(task='Find founders of browser-use', llm=models.azure_gpt_4_1_mini)
agent.run_sync()

View File

@@ -5,14 +5,10 @@ import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dotenv import load_dotenv
from lmnr import Laminar
load_dotenv()
Laminar.initialize()
from browser_use import Agent
from browser_use.llm import ChatGroq

View File

@@ -8,15 +8,11 @@ import asyncio
import os
from dotenv import load_dotenv
from lmnr import Laminar
from browser_use import Agent, ChatOpenAI
load_dotenv()
Laminar.initialize()
# All the models are type safe from OpenAI in case you need a list of supported models
llm = ChatOpenAI(
model='x-ai/grok-4',

View File

@@ -2,7 +2,7 @@
name = "browser-use"
description = "Make websites accessible for AI agents"
authors = [{ name = "Gregor Zunic" }]
version = "0.7.1"
version = "0.7.3"
readme = "README.md"
requires-python = ">=3.11,<4.0"
classifiers = [
@@ -28,9 +28,9 @@ dependencies = [
"typing-extensions>=4.12.2",
"uuid7>=0.1.0",
"authlib>=1.6.0",
"google-genai==1.29.0",
"openai==1.99.2",
"anthropic==0.58.2",
"google-genai>=1.29.0,<2.0.0",
"openai>=1.99.2,<2.0.0",
"anthropic>=0.58.2,<1.0.0",
"groq>=0.30.0",
"ollama>=0.5.1",
"google-api-python-client>=2.174.0",
@@ -38,9 +38,11 @@ dependencies = [
"google-auth-oauthlib>=1.2.2",
"mcp>=1.10.1",
"pypdf>=5.7.0",
"reportlab>=4.0.0",
"cdp-use>=1.4.0",
"markdown-pdf==1.5",
"pyotp>=2.9.0",
"html2text>=2025.4.15",
"pillow>=11.2.1",
]
# google-api-core: only used for Google LLM APIs
# pyperclip: only used for examples that use copy/paste
@@ -61,7 +63,12 @@ cli = [
aws = [
"boto3>=1.38.45"
]
video = [
"imageio[ffmpeg]>=2.37.0",
"numpy>=2.3.2",
]
examples = [
"agentmail>=0.0.53",
# botocore: only needed for Bedrock Claude boto3 examples/models/bedrock_claude.py
"botocore>=1.37.23",
"imgcat>=0.6.0",
@@ -70,9 +77,8 @@ examples = [
"langchain-openai>=0.3.26",
]
eval = [
"lmnr[all]==0.7.6",
"lmnr[all]==0.7.10",
"anyio>=4.9.0",
"Pillow>=11.2.1",
"psutil>=7.0.0",
"datamodel-code-generator>=0.26.0",
"hyperbrowser==0.47.0",
@@ -195,8 +201,8 @@ dev-dependencies = [
"pyright>=1.1.403",
"ty>=0.0.1a1",
"pytest-xdist>=3.7.0",
"pillow>=11.2.1",
"lmnr[all]==0.7.6",
"lmnr[all]==0.7.10",
# "pytest-playwright-asyncio>=0.7.0", # not actually needed I think
"pytest-timeout>=2.4.0",
"pydantic_settings>=2.10.1"
]

BIN
static/NiceHack69.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 46 KiB

View File

@@ -0,0 +1,10 @@
name: Google Maps 3d Screenshot
task: Go to google.com/maps and search for ETH Zurich Hauptgebäude. When found, close the side panel to see the map full screen. Then, if not already in Satellite View, switch to Satellite View. With Satellite View enabled, click the 3d icon to enable 3d view. Pan the map so that ETH Zurich Hauptgebäude and the Zurich Lake in the background are clearly visible. If able, take a screenshot.
judge_context:
- Agent must only use www.google.com/maps
- Agent should correctly search for ETH Zurich Hauptgebäude
- Agent should close the side panel
- After the agent performs the task, the map should be visible in Satellite View, if it was not already
- The Agent should click the correct 3d button to enable 3d mode
- The Agent should pan the map so that both ETH Zurich Hauptgebäude and the Zurich Lake in the background are visible
max_steps: 25

View File

@@ -51,7 +51,14 @@ class TestUrlAllowlistSecurity:
# Test more complex glob patterns
browser_profile = BrowserProfile(
allowed_domains=['*.google.com', 'https://wiki.org', 'https://good.com', 'chrome://version', 'brave://*'],
allowed_domains=[
'*.google.com',
'https://wiki.org',
'https://good.com',
'https://*.test.com',
'chrome://version',
'brave://*',
],
headless=True,
user_data_dir=None,
)
@@ -90,6 +97,10 @@ class TestUrlAllowlistSecurity:
assert watchdog._is_url_allowed('https://sub.example.com%20@malicious.org') is False
assert watchdog._is_url_allowed('https://anygoogle.com@evil.org') is False
# Test pattern matching
assert watchdog._is_url_allowed('https://www.test.com') is True
assert watchdog._is_url_allowed('https://www.testx.com') is False
def test_glob_pattern_edge_cases(self):
"""Test edge cases for glob pattern matching to ensure proper behavior."""
from bubus import EventBus

View File

@@ -10,8 +10,11 @@ The serialization shows radio buttons as:
Usage:
uv run pytest tests/ci/test_radio_buttons.py -v -s
Note: This test requires a real LLM API key and is skipped in CI environments.
"""
import os
from pathlib import Path
import pytest
@@ -64,6 +67,10 @@ async def browser_session():
await browser_session.kill()
@pytest.mark.skipif(
os.getenv('CI') == 'true' or os.getenv('GITHUB_ACTIONS') == 'true',
reason='Skipped in CI: requires real LLM API key which blocks other tests',
)
class TestRadioButtons:
"""Test cases for radio button interactions."""

View File

@@ -164,7 +164,7 @@ class TestToolsIntegration:
assert schema['properties']['seconds']['default'] == 3
# Create wait action for 1 second - fix to use a dictionary
wait_action = {'wait': {'seconds': 1}} # Corrected format
wait_action = {'wait': {'seconds': 3}} # Corrected format
class WaitActionModel(ActionModel):
wait: dict | None = None
@@ -184,7 +184,7 @@ class TestToolsIntegration:
assert 'Waited for' in result.extracted_content or 'Waiting for' in result.extracted_content
# Verify that approximately 1 second has passed (allowing some margin)
assert 0.8 <= end_time - start_time <= 1.5 # Allow some timing margin for 1 second wait
assert end_time - start_time <= 0.5 # We wait 3-3 seconds for LLM call
# longer wait
# Create wait action for 1 second - fix to use a dictionary
@@ -204,9 +204,7 @@ class TestToolsIntegration:
assert result.extracted_content is not None
assert 'Waited for' in result.extracted_content or 'Waiting for' in result.extracted_content
# Verify that approximately 5 seconds have passed (allowing some margin)
assert 4.5 <= end_time - start_time <= 6.0 # Allow some timing margin for 5 second wait
assert end_time - start_time >= 1.9 # Allow some timing margin
assert 1.5 <= end_time - start_time <= 2.5 # We wait 5-3 seconds for LLM call
async def test_go_back_action(self, tools, browser_session, base_url):
"""Test that go_back action navigates to the previous page."""

View File

@@ -0,0 +1,161 @@
"""
Simplified tests for URL shortening functionality in Agent service.
Three focused tests:
1. Input message processing with URL shortening
2. Output processing with custom actions and URL restoration
3. End-to-end pipeline test
"""
import json
import pytest
from browser_use.agent.service import Agent
from browser_use.agent.views import AgentOutput
from browser_use.llm.messages import AssistantMessage, BaseMessage, UserMessage
# Super long URL to reuse across tests - much longer than the 25 character limit
# Includes both query params (?...) and fragment params (#...)
SUPER_LONG_URL = 'https://documentation.example-company.com/api/v3/enterprise/user-management/endpoints/administration/create-new-user-account-with-permissions/advanced-settings?format=detailed-json&version=3.2.1&timestamp=1699123456789&session_id=abc123def456ghi789&authentication_token=very_long_authentication_token_string_here&include_metadata=true&expand_relationships=user_groups,permissions,roles&sort_by=created_at&order=desc&page_size=100&include_deprecated_fields=false&api_key=super_long_api_key_that_exceeds_normal_limits#section=user_management&tab=advanced&view=detailed&scroll_to=permissions_table&highlight=admin_settings&filter=active_users&expand_all=true&debug_mode=enabled'
@pytest.fixture
def agent():
	"""Provide an Agent wired to a mock LLM with a 25-character URL shortening limit."""
	from tests.ci.conftest import create_mock_llm

	mock_llm = create_mock_llm()
	return Agent(task='Test URL shortening', llm=mock_llm, url_shortening_limit=25)
class TestUrlShorteningInputProcessing:
	"""Input-side checks: long URLs in messages get shortened and mapped."""

	def test_process_input_messages_with_url_shortening(self, agent: Agent):
		"""A single UserMessage containing a long URL is rewritten in-place."""
		original_content = f'Please visit {SUPER_LONG_URL} and extract information'
		messages: list[BaseMessage] = [UserMessage(content=original_content)]

		# The agent mutates the messages in-place and returns {shortened -> original}.
		mapping = agent._process_messsages_and_replace_long_urls_shorter_ones(messages)

		rewritten = messages[0].content or ''
		# Content must have changed, shrunk, and still carry the URL's origin.
		assert rewritten != original_content
		assert len(rewritten) < len(original_content)
		assert 'https://documentation.example-company.com' in rewritten

		# Exactly one mapping entry, pointing back at the original URL.
		assert len(mapping) == 1
		short = next(iter(mapping))
		assert mapping[short] == SUPER_LONG_URL

	def test_process_user_and_assistant_messages_with_url_shortening(self, agent: Agent):
		"""The same long URL is shortened in both user and assistant messages."""
		user_text = f'I need to access {SUPER_LONG_URL} for the API documentation'
		assistant_text = f'I will help you navigate to {SUPER_LONG_URL} to retrieve the documentation'
		messages: list[BaseMessage] = [UserMessage(content=user_text), AssistantMessage(content=assistant_text)]

		mapping = agent._process_messsages_and_replace_long_urls_shorter_ones(messages)

		rewritten_user = messages[0].content or ''
		rewritten_assistant = messages[1].content or ''
		for before, after in ((user_text, rewritten_user), (assistant_text, rewritten_assistant)):
			assert after != before
			assert len(after) < len(before)
			assert 'https://documentation.example-company.com' in after

		# Both occurrences of the same long URL share a single shortened form.
		assert len(mapping) == 1
		short = next(iter(mapping))
		assert mapping[short] == SUPER_LONG_URL
class TestUrlShorteningOutputProcessing:
	"""Output-side checks: shortened URLs inside AgentOutput are expanded again."""

	def test_process_output_with_custom_actions_and_url_restoration(self, agent: Agent):
		"""Shortened URLs in thinking/memory/actions are restored to the originals."""
		# Simulate a prior shortening pass so we hold a known {short -> long} pair.
		short: str = agent._replace_urls_in_text(SUPER_LONG_URL)[0]
		mapping = {short: SUPER_LONG_URL}

		# Build an AgentOutput (with custom actions) that only knows the short URL.
		payload = {
			'thinking': f'I need to navigate to {short} for documentation',
			'evaluation_previous_goal': 'Successfully processed the request',
			'memory': f'Found useful info at {short}',
			'next_goal': 'Complete the documentation review',
			'action': [{'go_to_url': {'url': short, 'new_tab': False}}],
		}
		action_model = agent.tools.registry.create_action_model()
		output_cls = AgentOutput.type_with_custom_actions(action_model)
		output = output_cls.model_validate_json(json.dumps(payload))

		# Restoration mutates the pydantic model in-place.
		agent._recursive_process_all_strings_inside_pydantic_model(output, mapping)

		assert SUPER_LONG_URL in (output.thinking or '')
		assert SUPER_LONG_URL in (output.memory or '')
		dumped = output.action[0].model_dump()
		assert dumped['go_to_url']['url'] == SUPER_LONG_URL
class TestUrlShorteningEndToEnd:
	"""Full round-trip: shorten on input, carry the short form through an output, restore."""

	def test_complete_url_shortening_pipeline(self, agent: Agent):
		"""Input shortening -> simulated agent output -> output restoration."""
		# --- Step 1: input shortening (mutates the message list in-place) ---
		original_content = f'Navigate to {SUPER_LONG_URL} and extract the API documentation'
		messages: list[BaseMessage] = [UserMessage(content=original_content)]
		mapping = agent._process_messsages_and_replace_long_urls_shorter_ones(messages)

		assert len(mapping) == 1
		short = next(iter(mapping))
		assert mapping[short] == SUPER_LONG_URL
		assert short in (messages[0].content or '')

		# --- Step 2: fake agent output that references only the short URL ---
		payload = {
			'thinking': f'I will navigate to {short} to get the documentation',
			'evaluation_previous_goal': 'Starting documentation extraction',
			'memory': f'Target URL: {short}',
			'next_goal': 'Extract API documentation',
			'action': [{'go_to_url': {'url': short, 'new_tab': True}}],
		}
		action_model = agent.tools.registry.create_action_model()
		output_cls = AgentOutput.type_with_custom_actions(action_model)
		output = output_cls.model_validate_json(json.dumps(payload))

		# --- Step 3: restore the long URL in-place ---
		agent._recursive_process_all_strings_inside_pydantic_model(output, mapping)

		assert SUPER_LONG_URL in (output.thinking or '')
		assert SUPER_LONG_URL in (output.memory or '')
		dumped = output.action[0].model_dump()
		assert dumped['go_to_url']['url'] == SUPER_LONG_URL
		assert dumped['go_to_url']['new_tab'] is True
		# The short form must be fully gone after restoration.
		assert short not in (output.thinking or '')
		assert short not in (output.memory or '')