diff --git a/.env.example b/.env.example index 3fe667fd6..ef0fe4736 100644 --- a/.env.example +++ b/.env.example @@ -1,64 +1,51 @@ -# Browser Use Environment Configuration -# Copy this file to .env and configure your API keys and settings +# Browser Use Configuration +# Copy this file to .env and fill in your values -# ============================================================================= -# API Keys for Language Models -# ============================================================================= -OPENAI_API_KEY= -ANTHROPIC_API_KEY= -GOOGLE_API_KEY= -DEEPSEEK_API_KEY= -GROK_API_KEY= -NOVITA_API_KEY= - -# Azure OpenAI Configuration -AZURE_OPENAI_ENDPOINT= -AZURE_OPENAI_KEY= - -# ============================================================================= # Logging Configuration -# ============================================================================= -# Browser Use logging level (debug, info, warning, error) +# Set the logging level (debug, info, warning, error) BROWSER_USE_LOGGING_LEVEL=info -# CDP (Chrome DevTools Protocol) logging level for cdp_use library -# Controls logging verbosity of Chrome DevTools Protocol interactions -# Recommended: WARNING to reduce noise (debug, info, warning, error) +# Log file paths (optional) +# Save debug level logs to this file +BROWSER_USE_DEBUG_LOG_FILE=debug.log + +# Save info level logs to this file +BROWSER_USE_INFO_LOG_FILE=info.log + +# CDP (Chrome DevTools Protocol) logging level CDP_LOGGING_LEVEL=WARNING -# ============================================================================= -# Telemetry and Cloud Configuration -# ============================================================================= -# Enable anonymous telemetry collection +# Telemetry and Analytics +# Enable/disable anonymous telemetry ANONYMIZED_TELEMETRY=true -# Browser Use Cloud Configuration -BROWSER_USE_CLOUD_SYNC= -BROWSER_USE_CLOUD_API_URL=https://api.browser-use.com -BROWSER_USE_CLOUD_UI_URL= +# Browser Use Cloud Configuration 
(optional) +# Your Browser Use Cloud API key - get it from: https://cloud.browser-use.com/billing +# BROWSER_USE_API_KEY=your_api_key_here -# ============================================================================= -# Development and Runtime Configuration -# ============================================================================= -# Skip LLM API key verification during initialization -SKIP_LLM_API_KEY_VERIFICATION=false +# Custom API base URL (for enterprise installations) +# BROWSER_USE_CLOUD_API_URL=https://api.browser-use.com -# Runtime environment flags -IN_DOCKER= -IS_IN_EVALS=false +# Cloud sync settings +# BROWSER_USE_CLOUD_SYNC=false -# Path configuration -XDG_CACHE_HOME=~/.cache -XDG_CONFIG_HOME=~/.config -BROWSER_USE_CONFIG_DIR= +# Model Configuration +# API keys for the LLM providers you use +# OPENAI_API_KEY=your_openai_api_key_here +# ANTHROPIC_API_KEY=your_anthropic_api_key_here -# Windows font directory (Windows only) -WIN_FONT_DIR=C:\Windows\Fonts +# Browser Configuration +# Path to Chrome/Chromium executable (optional) +# BROWSER_USE_EXECUTABLE_PATH=/path/to/chrome -# ============================================================================= -# MCP (Model Context Protocol) Configuration -# ============================================================================= -BROWSER_USE_CONFIG_PATH= -BROWSER_USE_HEADLESS= -BROWSER_USE_ALLOWED_DOMAINS= -BROWSER_USE_LLM_MODEL= +# Run browser in headless mode +# BROWSER_USE_HEADLESS=false + +# User data directory for browser profile +# BROWSER_USE_USER_DATA_DIR=./browser_data + +# Proxy Configuration (optional) +# BROWSER_USE_PROXY_SERVER=http://proxy.example.com:8080 +# BROWSER_USE_NO_PROXY=localhost,127.0.0.1,*.internal +# BROWSER_USE_PROXY_USERNAME=username +# BROWSER_USE_PROXY_PASSWORD=password diff --git a/.gitignore b/.gitignore index bab0e6265..ffa0cf9ef 100644 --- a/.gitignore +++ b/.gitignore @@ -53,3 +53,6 @@ credentials.json token.json !docs/docs.json + + +temp-profile-* \ No newline at end of file 
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 612128fae..d3bb348bc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,7 +12,7 @@ repos: - tomli - repo: https://github.com/asottile/pyupgrade - rev: v3.19.1 + rev: v3.20.0 hooks: - id: pyupgrade args: [--py311-plus] @@ -23,19 +23,20 @@ repos: # - id: add-trailing-comma - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.11.2 + rev: v0.12.10 hooks: - - id: ruff + - id: ruff-check + args: [ --fix ] - id: ruff-format # see pyproject.toml for more details on ruff config - repo: https://github.com/RobertCraigie/pyright-python - rev: v1.1.403 + rev: v1.1.404 hooks: - id: pyright - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 + rev: v6.0.0 hooks: # check for basic syntax errors in python and data files - id: check-ast diff --git a/browser_use/__init__.py b/browser_use/__init__.py index 848e4f8e9..9bc489531 100644 --- a/browser_use/__init__.py +++ b/browser_use/__init__.py @@ -5,7 +5,14 @@ from browser_use.logging_config import setup_logging # Only set up logging if not in MCP mode or if explicitly requested if os.environ.get('BROWSER_USE_SETUP_LOGGING', 'true').lower() != 'false': - logger = setup_logging() + from browser_use.config import CONFIG + + # Get log file paths from config/environment + debug_log_file = getattr(CONFIG, 'BROWSER_USE_DEBUG_LOG_FILE', None) + info_log_file = getattr(CONFIG, 'BROWSER_USE_INFO_LOG_FILE', None) + + # Set up logging with file handlers if specified + logger = setup_logging(debug_log_file=debug_log_file, info_log_file=info_log_file) else: import logging @@ -42,6 +49,7 @@ if TYPE_CHECKING: from browser_use.agent.service import Agent from browser_use.agent.views import ActionModel, ActionResult, AgentHistoryList from browser_use.browser import BrowserProfile, BrowserSession + from browser_use.browser import BrowserSession as Browser from browser_use.controller.service import Controller from browser_use.dom.service 
import DomService from browser_use.llm.anthropic.chat import ChatAnthropic @@ -64,6 +72,7 @@ _LAZY_IMPORTS = { 'AgentHistoryList': ('browser_use.agent.views', 'AgentHistoryList'), # Browser components (heavy due to playwright/patchright) 'BrowserSession': ('browser_use.browser', 'BrowserSession'), + 'Browser': ('browser_use.browser', 'BrowserSession'), # Alias for BrowserSession 'BrowserProfile': ('browser_use.browser', 'BrowserProfile'), # Controller (moderate weight) 'Controller': ('browser_use.controller.service', 'Controller'), @@ -100,6 +109,7 @@ def __getattr__(name: str): __all__ = [ 'Agent', 'BrowserSession', + 'Browser', # Alias for BrowserSession 'BrowserProfile', 'Controller', 'DomService', diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index 197e0477c..727286145 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -34,6 +34,8 @@ from bubus import EventBus from pydantic import ValidationError from uuid_extensions import uuid7str +from browser_use import Browser, BrowserProfile, BrowserSession + # Lazy import for gif to avoid heavy agent.views import at startup # from browser_use.agent.gif import create_history_gif from browser_use.agent.message_manager.service import ( @@ -53,7 +55,6 @@ from browser_use.agent.views import ( BrowserStateHistory, StepMetadata, ) -from browser_use.browser import BrowserProfile, BrowserSession from browser_use.browser.session import DEFAULT_BROWSER_PROFILE from browser_use.browser.views import BrowserStateSummary from browser_use.config import CONFIG @@ -134,6 +135,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): # Optional parameters browser_profile: BrowserProfile | None = None, browser_session: BrowserSession | None = None, + browser: Browser | None = None, # Alias for browser_session (cleaner naming) controller: Controller[Context] | None = None, # Initial agent run parameters sensitive_data: dict[str, str | dict[str, str]] | None = None, @@ -323,7 +325,6 @@ 
class Agent(Generic[Context, AgentStructuredOutput]): logger.debug( f'{" +vision" if self.settings.use_vision else ""}' f' extraction_model={self.settings.page_extraction_llm.model if self.settings.page_extraction_llm else "Unknown"}' - # Note: No longer logging planner_model (deprecated) f'{" +file_system" if self.file_system else ""}' ) @@ -357,6 +358,11 @@ class Agent(Generic[Context, AgentStructuredOutput]): browser_profile = browser_profile or DEFAULT_BROWSER_PROFILE + # Handle browser vs browser_session parameter (browser takes precedence) + if browser and browser_session: + raise ValueError('Cannot specify both "browser" and "browser_session" parameters. Use "browser" for the cleaner API.') + browser_session = browser or browser_session + self.browser_session = browser_session or BrowserSession( browser_profile=browser_profile, id=uuid7str()[:-4] + self.id[-4:], # re-use the same 4-char suffix so they show up together in logs @@ -466,13 +472,13 @@ class Agent(Generic[Context, AgentStructuredOutput]): def logger(self) -> logging.Logger: """Get instance-specific logger with task ID in the name""" - _browser_session_id = self.browser_session.id if self.browser_session else self.id + _browser_session_id = self.browser_session.id if self.browser_session else '----' _current_target_id = ( - self.browser_session.agent_focus.target_id[-4:] + self.browser_session.agent_focus.target_id[-2:] if self.browser_session and self.browser_session.agent_focus and self.browser_session.agent_focus.target_id else '--' ) - return logging.getLogger(f'browser_use.Agent🅰 {self.task_id[-4:]} on 🆂 {_browser_session_id[-4:]} 🅟 {_current_target_id}') + return logging.getLogger(f'browser_use.Agent🅰 {self.task_id[-4:]} ⇢ 🅑 {_browser_session_id[-4:]} 🅣 {_current_target_id}') @property def browser_profile(self) -> BrowserProfile: @@ -638,6 +644,13 @@ class Agent(Generic[Context, AgentStructuredOutput]): # The task continues with new instructions, it doesn't end and start a new one self.task 
= new_task self._message_manager.add_new_task(new_task) + # Mark as follow-up task and recreate eventbus (gets shut down after each run) + self.state.follow_up_task = True + self.eventbus = EventBus(name=f'Agent_{str(self.id)[-self.state.n_steps :]}') + + # Re-register cloud sync handler if it exists (if not disabled) + if hasattr(self, 'cloud_sync') and self.cloud_sync and self.enable_cloud_sync: + self.eventbus.on('*', self.cloud_sync.handle_event) @observe_debug(ignore_input=True, ignore_output=True, name='_raise_if_stopped_or_paused') async def _raise_if_stopped_or_paused(self) -> None: @@ -1217,22 +1230,33 @@ class Agent(Generic[Context, AgentStructuredOutput]): r'(?:www\.)?[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-zA-Z]{2,}(?:/[^\s<>"\']*)?', # Domain names with subdomains and optional paths ] + # Email pattern to exclude + email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b' + + found_urls = [] for pattern in patterns: - match = re.search(pattern, task) - if match: + matches = re.finditer(pattern, task) + for match in matches: url = match.group(0) + # Skip if this looks like an email address + if re.search(email_pattern, url): + continue # Remove trailing punctuation that's not part of URLs url = re.sub(r'[.,;:!?()\[\]]+$', '', url) # Add https:// if missing if not url.startswith(('http://', 'https://')): url = 'https://' + url - return url + found_urls.append(url) - # If no URL found, check if task mentions Google or search - task_lower = task.lower() - if 'google' in task_lower or 'search' in task_lower: - self.logger.debug('📍 Task mentions "google" or "search", defaulting to https://google.com') - return 'https://google.com' + unique_urls = list(set(found_urls)) + # If multiple URLs found, skip preloading + if len(unique_urls) > 1: + self.logger.debug(f'📍 Multiple URLs found ({len(found_urls)}), skipping preload to avoid ambiguity') + return None + + # If exactly one URL found, return it + if len(unique_urls) == 1: + return unique_urls[0] 
return None @@ -1274,7 +1298,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): self._log_agent_run() self.logger.debug( - f'🔧 Agent setup: Task ID {self.task_id[-4:]}, Session ID {self.session_id[-4:]}, Browser Session ID {self.browser_session.id[-4:] if self.browser_session else "None"}' + f'🔧 Agent setup: Agent Session ID {self.session_id[-4:]}, Task ID {self.task_id[-4:]}, Browser Session ID {self.browser_session.id[-4:] if self.browser_session else "None"} {"(connecting via CDP)" if (self.browser_session and self.browser_session.cdp_url) else "(launching local browser)"}' ) # Initialize timing for session and task @@ -1304,7 +1328,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): self.logger.debug('🔧 Browser session started with watchdogs attached') # Check if task contains a URL and add it as an initial action (only if preload is enabled) - if self.preload: + if self.preload and not self.state.follow_up_task: initial_url = self._extract_url_from_task(self.task) if initial_url: self.logger.info(f'🔗 Found URL in task: {initial_url}, adding as initial action...') @@ -1337,7 +1361,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): self.logger.debug(f'✅ Added navigation to {initial_url} as initial action') # Execute initial actions if provided - if self.initial_actions: + if self.initial_actions and not self.state.follow_up_task: self.logger.debug(f'⚡ Executing {len(self.initial_actions)} initial actions...') result = await self.multi_act(self.initial_actions, check_for_new_elements=False) self.state.last_result = result @@ -1499,7 +1523,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): # Stop the event bus gracefully, waiting for all events to be processed # Use longer timeout to avoid deadlocks in tests with multiple agents - await self.eventbus.stop(timeout=10.0) + await self.eventbus.stop(timeout=3.0) await self.close() diff --git a/browser_use/agent/views.py b/browser_use/agent/views.py index 1bbc570aa..0a4afeb29 100644 --- 
a/browser_use/agent/views.py +++ b/browser_use/agent/views.py @@ -70,6 +70,7 @@ class AgentState(BaseModel): paused: bool = False stopped: bool = False session_initialized: bool = False # Track if session events have been dispatched + follow_up_task: bool = False # Track if the agent is a follow-up task message_manager_state: MessageManagerState = Field(default_factory=MessageManagerState) file_system_state: FileSystemState | None = None diff --git a/browser_use/browser/__init__.py b/browser_use/browser/__init__.py index 48d852010..4ef9bf93b 100644 --- a/browser_use/browser/__init__.py +++ b/browser_use/browser/__init__.py @@ -5,6 +5,7 @@ if TYPE_CHECKING: from .profile import BrowserProfile, ProxySettings from .session import BrowserSession + # Lazy imports mapping for heavy browser components _LAZY_IMPORTS = { 'ProxySettings': ('.profile', 'ProxySettings'), diff --git a/browser_use/browser/events.py b/browser_use/browser/events.py index 3b2522cb1..c84f6518d 100644 --- a/browser_use/browser/events.py +++ b/browser_use/browser/events.py @@ -41,6 +41,7 @@ class ElementSelectedEvent(BaseEvent[T_EventResultType]): is_visible=data.is_visible, absolute_position=data.absolute_position, # override the circular reference fields in EnhancedDOMTreeNode as they cant be serialized and aren't needed by event handlers + # only used internally by the DOM service during DOM tree building process, not intended public API use content_document=None, shadow_root_type=None, shadow_roots=[], @@ -86,7 +87,7 @@ class NavigateToUrlEvent(BaseEvent[None]): ) # existing_tab: PageHandle | None = None # TODO - # limit enforced by bubus, not exposed to LLM: + # time limits enforced by bubus, not exposed to LLM: event_timeout: float | None = 15.0 # seconds diff --git a/browser_use/browser/profile.py b/browser_use/browser/profile.py index 06b135497..c05906a4c 100644 --- a/browser_use/browser/profile.py +++ b/browser_use/browser/profile.py @@ -9,7 +9,6 @@ from typing import Annotated, Any, Literal, 
Self from urllib.parse import urlparse from pydantic import AfterValidator, AliasChoices, BaseModel, ConfigDict, Field, field_validator, model_validator -from uuid_extensions import uuid7str from browser_use.config import CONFIG from browser_use.observability import observe_debug @@ -596,8 +595,9 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro # ... extends options defined in: # BrowserLaunchPersistentContextArgs, BrowserLaunchArgs, BrowserNewContextArgs, BrowserConnectArgs - # Unique identifier for this browser profile - id: str = Field(default_factory=uuid7str) + # Session/connection configuration + cdp_url: str | None = Field(default=None, description='CDP URL for connecting to existing browser instance') + is_local: bool = Field(default=True, description='Whether this is a local browser instance') # label: str = 'default' # custom options we provide that aren't native playwright kwargs @@ -673,10 +673,10 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro def __repr__(self) -> str: short_dir = _log_pretty_path(self.user_data_dir) if self.user_data_dir else '' - return f'BrowserProfile#{self.id[-4:]}(user_data_dir= {short_dir}, headless={self.headless})' + return f'BrowserProfile(user_data_dir= {short_dir}, headless={self.headless})' def __str__(self) -> str: - return f'BrowserProfile#{self.id[-4:]}' + return 'BrowserProfile' @model_validator(mode='after') def copy_old_config_names_to_new(self) -> Self: diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py index e4d65092e..6c7c9858e 100644 --- a/browser_use/browser/session.py +++ b/browser_use/browser/session.py @@ -2,7 +2,9 @@ import asyncio import logging -from typing import Any, Self, cast +from functools import cached_property +from pathlib import Path +from typing import Any, Literal, Self, cast import httpx from bubus import EventBus @@ -34,7 +36,7 @@ from browser_use.browser.events import ( TabClosedEvent, 
TabCreatedEvent, ) -from browser_use.browser.profile import BrowserProfile +from browser_use.browser.profile import BrowserProfile, ProxySettings from browser_use.browser.views import BrowserStateSummary, TabInfo from browser_use.dom.views import EnhancedDOMTreeNode, TargetInfo from browser_use.utils import _log_pretty_url, is_new_tab_page @@ -44,6 +46,10 @@ DEFAULT_BROWSER_PROFILE = BrowserProfile() MAX_SCREENSHOT_HEIGHT = 2000 MAX_SCREENSHOT_WIDTH = 1920 +_LOGGED_UNIQUE_SESSION_IDS = set() # track unique session IDs that have been logged to make sure we always assign a unique enough id to new sessions and avoid ambiguity in logs +red = '\033[91m' +reset = '\033[0m' + class CDPSession(BaseModel): """Info about a single CDP session bound to a specific target. @@ -88,7 +94,7 @@ class CDPSession(BaseModel): import logging logger = logging.getLogger(f'browser_use.CDPSession.{target_id[-4:]}') - logger.debug(f'🔌 Creating dedicated WebSocket connection for target {target_id}') + logger.debug(f'🔌 Creating new dedicated WebSocket connection for target 🅣 {target_id}') target_cdp_client = CDPClient(cdp_url) await target_cdp_client.start() @@ -148,7 +154,7 @@ class CDPSession(BaseModel): # if 'Debugger' not in domains: # await self.cdp_client.send.Debugger.disable() # await cdp_session.cdp_client.send.EventBreakpoints.disable(session_id=cdp_session.session_id) - except Exception as e: + except Exception: # self.logger.warning(f'Failed to disable page JS breakpoints: {e}') pass @@ -186,6 +192,19 @@ class BrowserSession(BaseModel): - Direct CDP/Playwright calls for browser operations Supports both event-driven and imperative calling styles. 
+ + Browser configuration is stored in the browser_profile, session identity in direct fields: + ```python + # Direct settings (recommended for most users) + session = BrowserSession(headless=True, user_data_dir='./profile') + + # Or use a profile (for advanced use cases) + session = BrowserSession(browser_profile=BrowserProfile(...)) + + # Access session fields directly, browser settings via profile or property + print(session.id) # Session field + print(session.browser_profile.stealth) # Direct browser_profile access + ``` """ model_config = ConfigDict( @@ -195,16 +214,124 @@ class BrowserSession(BaseModel): revalidate_instances='never', # resets private attrs on every model rebuild ) - # Core configuration - id: str = Field(default_factory=lambda: str(uuid7str())) + def __init__( + self, + # Core configuration + id: str | None = None, + cdp_url: str | None = None, + is_local: bool = True, + browser_profile: BrowserProfile | None = None, + # BrowserProfile fields that can be passed directly + # From BrowserConnectArgs + headers: dict[str, str] | None = None, + slow_mo: float | None = None, + timeout: float | None = None, + # From BrowserLaunchArgs + env: dict[str, str | float | bool] | None = None, + executable_path: str | Path | None = None, + headless: bool | None = None, + args: list[str] | None = None, + ignore_default_args: list[str] | Literal[True] | None = None, + channel: str | None = None, + chromium_sandbox: bool | None = None, + devtools: bool | None = None, + downloads_path: str | Path | None = None, + traces_dir: str | Path | None = None, + handle_sighup: bool | None = None, + handle_sigint: bool | None = None, + handle_sigterm: bool | None = None, + # From BrowserContextArgs + accept_downloads: bool | None = None, + offline: bool | None = None, + strict_selectors: bool | None = None, + permissions: list[str] | None = None, + bypass_csp: bool | None = None, + extra_http_headers: dict[str, str] | None = None, + ignore_https_errors: bool | None = None, 
+ java_script_enabled: bool | None = None, + base_url: str | None = None, + service_workers: str | None = None, + user_agent: str | None = None, + screen: dict | None = None, + viewport: dict | None = None, + no_viewport: bool | None = None, + device_scale_factor: float | None = None, + is_mobile: bool | None = None, + has_touch: bool | None = None, + locale: str | None = None, + timezone_id: str | None = None, + color_scheme: str | None = None, + contrast: str | None = None, + reduced_motion: str | None = None, + forced_colors: str | None = None, + record_har_content: str | None = None, + record_har_mode: str | None = None, + record_har_omit_content: bool | None = None, + record_har_path: str | Path | None = None, + record_har_url_filter: str | None = None, + record_video_dir: str | Path | None = None, + record_video_size: dict | None = None, + # From BrowserLaunchPersistentContextArgs + user_data_dir: str | Path | None = None, + # From BrowserNewContextArgs + storage_state: str | Path | dict[str, Any] | None = None, + # BrowserProfile specific fields + stealth: bool | None = None, + disable_security: bool | None = None, + deterministic_rendering: bool | None = None, + allowed_domains: list[str] | None = None, + keep_alive: bool | None = None, + proxy: ProxySettings | None = None, + enable_default_extensions: bool | None = None, + window_size: dict | None = None, + window_position: dict | None = None, + cross_origin_iframes: bool | None = None, + default_navigation_timeout: float | None = None, + default_timeout: float | None = None, + minimum_wait_page_load_time: float | None = None, + wait_for_network_idle_page_load_time: float | None = None, + maximum_wait_page_load_time: float | None = None, + wait_between_actions: float | None = None, + include_dynamic_attributes: bool | None = None, + highlight_elements: bool | None = None, + viewport_expansion: int | None = None, + auto_download_pdfs: bool | None = None, + profile_directory: str | None = None, + 
cookies_file: Path | None = None, + ): + # Following the same pattern as AgentSettings in service.py + # Only pass non-None values to avoid validation errors + profile_kwargs = {k: v for k, v in locals().items() if k not in ['self', 'browser_profile', 'id'] and v is not None} - cdp_url: str | None = None - is_local: bool = Field(default=True) + # Create browser profile from direct parameters or use provided one + resolved_browser_profile = browser_profile or BrowserProfile(**profile_kwargs) + + # Initialize the Pydantic model + super().__init__( + id=id or str(uuid7str()), + browser_profile=resolved_browser_profile, + ) + + # Session configuration (session identity only) + id: str = Field(default_factory=lambda: str(uuid7str()), description='Unique identifier for this browser session') + + # Browser configuration (reusable profile) browser_profile: BrowserProfile = Field( default_factory=lambda: DEFAULT_BROWSER_PROFILE, description='BrowserProfile() options to use for the session, otherwise a default profile will be used', ) + # Convenience properties for common browser settings + @property + def cdp_url(self) -> str | None: + """CDP URL from browser profile.""" + return self.browser_profile.cdp_url + + @property + def is_local(self) -> bool: + """Whether this is a local browser instance from browser profile.""" + return self.browser_profile.is_local + # Main shared event bus for all browser session + all watchdogs event_bus: EventBus = Field(default_factory=EventBus) @@ -240,14 +367,28 @@ class BrowserSession(BaseModel): # self._logger = logging.getLogger(f'browser_use.{self}') return logging.getLogger(f'browser_use.{self}') + @cached_property + def _id_for_logs(self) -> str: + """Get human-friendly semi-unique identifier for differentiating different BrowserSession instances in logs""" + str_id = self.id[-4:] # default to last 4 chars of truly random uuid, less helpful than cdp port but always unique enough + port_number = (self.cdp_url or 'no-cdp').rsplit(':', 
1)[-1].split('/', 1)[0].strip() + port_is_random = not port_number.startswith('922') + port_is_unique_enough = port_number not in _LOGGED_UNIQUE_SESSION_IDS + if port_number and port_number.isdigit() and port_is_random and port_is_unique_enough: + # if cdp port is random/unique enough to identify this session, use it as our id in logs + _LOGGED_UNIQUE_SESSION_IDS.add(port_number) + str_id = port_number + return str_id + + @property + def _tab_id_for_logs(self) -> str: + return self.agent_focus.target_id[-2:] if self.agent_focus and self.agent_focus.target_id else f'{red}--{reset}' + def __repr__(self) -> str: - port_number = (self.cdp_url or 'no-cdp').rsplit(':', 1)[-1].split('/', 1)[0] - return f'BrowserSession🆂 {self.id[-4:]}:{port_number} #{str(id(self))[-2:]} (cdp_url={self.cdp_url}, profile={self.browser_profile})' + return f'BrowserSession🅑 {self._id_for_logs} 🅣 {self._tab_id_for_logs} (cdp_url={self.cdp_url}, profile={self.browser_profile})' def __str__(self) -> str: - # Note: _original_browser_session tracking moved to Agent class - port_number = (self.cdp_url or 'no-cdp').rsplit(':', 1)[-1].split('/', 1)[0] - return f'BrowserSession🆂 {self.id[-4:]}:{port_number} #{str(id(self))[-2:]}' # ' 🅟 {str(id(self.cdp_session.target_id))[-2:]}' + return f'BrowserSession🅑 {self._id_for_logs} 🅣 {self._tab_id_for_logs}' async def reset(self) -> None: """Clear all cached CDP sessions with proper cleanup.""" @@ -269,7 +410,7 @@ class BrowserSession(BaseModel): self.agent_focus = None if self.is_local: - self.cdp_url = None + self.browser_profile.cdp_url = None self._crash_watchdog = None self._downloads_watchdog = None @@ -374,7 +515,7 @@ class BrowserSession(BaseModel): launch_result: BrowserLaunchResult = cast( BrowserLaunchResult, await launch_event.event_result(raise_if_none=True, raise_if_any=True) ) - self.cdp_url = launch_result.cdp_url + self.browser_profile.cdp_url = launch_result.cdp_url else: raise ValueError('Got BrowserSession(is_local=False) but no cdp_url 
was provided to connect to!') @@ -646,7 +787,7 @@ class BrowserSession(BaseModel): # Reset state if self.is_local: - self.cdp_url = None + self.browser_profile.cdp_url = None # Notify stop and wait for all handlers to complete # LocalBrowserWatchdog listens for BrowserStopEvent and dispatches BrowserKillEvent @@ -795,17 +936,17 @@ class BrowserSession(BaseModel): self.logger.debug('Watchdogs already attached, skipping duplicate attachment') return - from browser_use.browser.aboutblank_watchdog import AboutBlankWatchdog + from browser_use.browser.watchdogs.aboutblank_watchdog import AboutBlankWatchdog # from browser_use.browser.crash_watchdog import CrashWatchdog - from browser_use.browser.default_action_watchdog import DefaultActionWatchdog - from browser_use.browser.dom_watchdog import DOMWatchdog - from browser_use.browser.downloads_watchdog import DownloadsWatchdog - from browser_use.browser.local_browser_watchdog import LocalBrowserWatchdog - from browser_use.browser.permissions_watchdog import PermissionsWatchdog - from browser_use.browser.popups_watchdog import PopupsWatchdog - from browser_use.browser.screenshot_watchdog import ScreenshotWatchdog - from browser_use.browser.security_watchdog import SecurityWatchdog + from browser_use.browser.watchdogs.default_action_watchdog import DefaultActionWatchdog + from browser_use.browser.watchdogs.dom_watchdog import DOMWatchdog + from browser_use.browser.watchdogs.downloads_watchdog import DownloadsWatchdog + from browser_use.browser.watchdogs.local_browser_watchdog import LocalBrowserWatchdog + from browser_use.browser.watchdogs.permissions_watchdog import PermissionsWatchdog + from browser_use.browser.watchdogs.popups_watchdog import PopupsWatchdog + from browser_use.browser.watchdogs.screenshot_watchdog import ScreenshotWatchdog + from browser_use.browser.watchdogs.security_watchdog import SecurityWatchdog # from browser_use.browser.storage_state_watchdog import StorageStateWatchdog # Initialize CrashWatchdog @@ 
-912,7 +1053,7 @@ class BrowserSession(BaseModel): This MUST succeed or the browser is unusable. Fails hard on any error. """ - self.cdp_url = cdp_url or self.cdp_url + self.browser_profile.cdp_url = cdp_url or self.cdp_url if not self.cdp_url: raise RuntimeError('Cannot setup CDP connection without CDP URL') @@ -925,7 +1066,7 @@ class BrowserSession(BaseModel): # Run a tiny HTTP client to query for the WebSocket URL from the /json/version endpoint async with httpx.AsyncClient() as client: version_info = await client.get(url) - self.cdp_url = version_info.json()['webSocketDebuggerUrl'] + self.browser_profile.cdp_url = version_info.json()['webSocketDebuggerUrl'] assert self.cdp_url is not None @@ -1940,29 +2081,3 @@ class BrowserSession(BaseModel): self.logger.debug(f'Failed to get CDP client for target {node.target_id}: {e}, using main session') return await self.get_or_create_cdp_session() - - -# # Fix Pydantic circular dependency for all watchdogs -# # This must be called after BrowserSession class is fully defined -# _watchdog_modules = [ -# 'browser_use.browser.crash_watchdog.CrashWatchdog', -# 'browser_use.browser.downloads_watchdog.DownloadsWatchdog', -# 'browser_use.browser.local_browser_watchdog.LocalBrowserWatchdog', -# 'browser_use.browser.storage_state_watchdog.StorageStateWatchdog', -# 'browser_use.browser.security_watchdog.SecurityWatchdog', -# 'browser_use.browser.aboutblank_watchdog.AboutBlankWatchdog', -# 'browser_use.browser.popups_watchdog.PopupsWatchdog', -# 'browser_use.browser.permissions_watchdog.PermissionsWatchdog', -# 'browser_use.browser.default_action_watchdog.DefaultActionWatchdog', -# 'browser_use.browser.dom_watchdog.DOMWatchdog', -# 'browser_use.browser.screenshot_watchdog.ScreenshotWatchdog', -# ] - -# for module_path in _watchdog_modules: -# try: -# module_name, class_name = module_path.rsplit('.', 1) -# module = __import__(module_name, fromlist=[class_name]) -# watchdog_class = getattr(module, class_name) -# 
watchdog_class.model_rebuild() -# except Exception: -# pass # Ignore if watchdog can't be imported or rebuilt diff --git a/browser_use/browser/types.py b/browser_use/browser/watchdogs/__init__.py similarity index 100% rename from browser_use/browser/types.py rename to browser_use/browser/watchdogs/__init__.py diff --git a/browser_use/browser/aboutblank_watchdog.py b/browser_use/browser/watchdogs/aboutblank_watchdog.py similarity index 100% rename from browser_use/browser/aboutblank_watchdog.py rename to browser_use/browser/watchdogs/aboutblank_watchdog.py diff --git a/browser_use/browser/crash_watchdog.py b/browser_use/browser/watchdogs/crash_watchdog.py similarity index 100% rename from browser_use/browser/crash_watchdog.py rename to browser_use/browser/watchdogs/crash_watchdog.py diff --git a/browser_use/browser/default_action_watchdog.py b/browser_use/browser/watchdogs/default_action_watchdog.py similarity index 99% rename from browser_use/browser/default_action_watchdog.py rename to browser_use/browser/watchdogs/default_action_watchdog.py index f58131516..476b629a4 100644 --- a/browser_use/browser/default_action_watchdog.py +++ b/browser_use/browser/watchdogs/default_action_watchdog.py @@ -495,7 +495,7 @@ class DefaultActionWatchdog(BaseWatchdog): self.logger.debug('🖱️ Clicked successfully using x,y coordinates') # Return coordinates as dict for metadata - return {"click_x": center_x, "click_y": center_y} + return {'click_x': center_x, 'click_y': center_y} except Exception as e: self.logger.warning(f'CDP click failed: {type(e).__name__}: {e}') @@ -673,7 +673,7 @@ class DefaultActionWatchdog(BaseWatchdog): # Get element info backend_node_id = element_node.backend_node_id - + # Track coordinates for metadata input_coordinates = None @@ -707,7 +707,7 @@ class DefaultActionWatchdog(BaseWatchdog): if bounds.get('width', 0) > 0 and bounds.get('height', 0) > 0: center_x = bounds['x'] + bounds['width'] / 2 center_y = bounds['y'] + bounds['height'] / 2 - 
input_coordinates = {"input_x": center_x, "input_y": center_y} + input_coordinates = {'input_x': center_x, 'input_y': center_y} self.logger.debug(f'📍 Input coordinates: x={center_x:.1f}, y={center_y:.1f}') # Provide helpful warnings for common issues @@ -837,7 +837,7 @@ class DefaultActionWatchdog(BaseWatchdog): ) # Small delay between characters await asyncio.sleep(0.01) - + # Return coordinates metadata if available return input_coordinates @@ -1293,6 +1293,9 @@ class DefaultActionWatchdog(BaseWatchdog): async def on_ScrollToTextEvent(self, event: ScrollToTextEvent) -> None: """Handle scroll to text request with CDP. Raises exception if text not found.""" + + # TODO: handle looking for text inside cross-origin iframes as well + # Get CDP client and session cdp_client = self.browser_session.cdp_client if self.browser_session.agent_focus is None: diff --git a/browser_use/browser/dom_watchdog.py b/browser_use/browser/watchdogs/dom_watchdog.py similarity index 99% rename from browser_use/browser/dom_watchdog.py rename to browser_use/browser/watchdogs/dom_watchdog.py index 54210b7a9..f17bfbd0c 100644 --- a/browser_use/browser/dom_watchdog.py +++ b/browser_use/browser/watchdogs/dom_watchdog.py @@ -411,7 +411,11 @@ class DOMWatchdog(BaseWatchdog): # Create or reuse DOM service if self._dom_service is None: # self.logger.debug('🔍 DOMWatchdog._build_dom_tree: Creating DomService...') - self._dom_service = DomService(browser_session=self.browser_session, logger=self.logger) + self._dom_service = DomService( + browser_session=self.browser_session, + logger=self.logger, + cross_origin_iframes=self.browser_session.browser_profile.cross_origin_iframes, + ) # self.logger.debug('🔍 DOMWatchdog._build_dom_tree: ✅ DomService created') # else: # self.logger.debug('🔍 DOMWatchdog._build_dom_tree: Reusing existing DomService') diff --git a/browser_use/browser/downloads_watchdog.py b/browser_use/browser/watchdogs/downloads_watchdog.py similarity index 99% rename from 
browser_use/browser/downloads_watchdog.py rename to browser_use/browser/watchdogs/downloads_watchdog.py index 2ed07cab0..f0bd5c986 100644 --- a/browser_use/browser/downloads_watchdog.py +++ b/browser_use/browser/watchdogs/downloads_watchdog.py @@ -269,9 +269,16 @@ class DownloadsWatchdog(BaseWatchdog): self.browser_session.browser_profile.downloads_path or f'{tempfile.gettempdir()}/browser_use_downloads.{str(self.browser_session.id)[-4:]}' ) + + # Initialize variables that may be used outside try blocks + unique_filename = None + file_size = 0 + expected_path = None + download_result = None + download_url = event.get('url', '') + suggested_filename = event.get('suggestedFilename', 'download') + try: - download_url = event.get('url', '') - suggested_filename = event.get('suggestedFilename', 'download') guid = event.get('guid', '') self.logger.debug(f'[DownloadsWatchdog] ⬇️ File download starting: {suggested_filename} from {download_url[:100]}...') diff --git a/browser_use/browser/local_browser_watchdog.py b/browser_use/browser/watchdogs/local_browser_watchdog.py similarity index 96% rename from browser_use/browser/local_browser_watchdog.py rename to browser_use/browser/watchdogs/local_browser_watchdog.py index c76a1b3dd..37d036585 100644 --- a/browser_use/browser/local_browser_watchdog.py +++ b/browser_use/browser/watchdogs/local_browser_watchdog.py @@ -46,17 +46,13 @@ class LocalBrowserWatchdog(BaseWatchdog): """Launch a local browser process.""" try: - self.logger.debug( - f'[LocalBrowserWatchdog] Received BrowserLaunchEvent, EventBus ID: {id(self.event_bus)}, launching local browser' - ) + self.logger.debug('[LocalBrowserWatchdog] Received BrowserLaunchEvent, launching local browser...') - self.logger.debug('[LocalBrowserWatchdog] Calling _launch_browser...') + # self.logger.debug('[LocalBrowserWatchdog] Calling _launch_browser...') process, cdp_url = await self._launch_browser() - self.logger.debug(f'[LocalBrowserWatchdog] _launch_browser returned: 
process={process}, cdp_url={cdp_url}') - self._subprocess = process + # self.logger.debug(f'[LocalBrowserWatchdog] _launch_browser returned: process={process}, cdp_url={cdp_url}') - self.logger.debug(f'[LocalBrowserWatchdog] Browser launched successfully at {cdp_url}, PID: {process.pid}') return BrowserLaunchResult(cdp_url=cdp_url) except Exception as e: self.logger.error(f'[LocalBrowserWatchdog] Exception in on_BrowserLaunchEvent: {e}', exc_info=True) @@ -145,7 +141,9 @@ class LocalBrowserWatchdog(BaseWatchdog): stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) - self.logger.debug(f'[LocalBrowserWatchdog] 🎭 Browser subprocess launched with browser_pid= {subprocess.pid}') + self.logger.debug( + f'[LocalBrowserWatchdog] 🎭 Browser running with browser_pid= {subprocess.pid} 🔗 listening on CDP port :{debug_port}' + ) # Convert to psutil.Process process = psutil.Process(subprocess.pid) diff --git a/browser_use/browser/permissions_watchdog.py b/browser_use/browser/watchdogs/permissions_watchdog.py similarity index 100% rename from browser_use/browser/permissions_watchdog.py rename to browser_use/browser/watchdogs/permissions_watchdog.py diff --git a/browser_use/browser/popups_watchdog.py b/browser_use/browser/watchdogs/popups_watchdog.py similarity index 100% rename from browser_use/browser/popups_watchdog.py rename to browser_use/browser/watchdogs/popups_watchdog.py diff --git a/browser_use/browser/screenshot_watchdog.py b/browser_use/browser/watchdogs/screenshot_watchdog.py similarity index 100% rename from browser_use/browser/screenshot_watchdog.py rename to browser_use/browser/watchdogs/screenshot_watchdog.py diff --git a/browser_use/browser/security_watchdog.py b/browser_use/browser/watchdogs/security_watchdog.py similarity index 100% rename from browser_use/browser/security_watchdog.py rename to browser_use/browser/watchdogs/security_watchdog.py diff --git a/browser_use/browser/storage_state_watchdog.py 
b/browser_use/browser/watchdogs/storage_state_watchdog.py similarity index 100% rename from browser_use/browser/storage_state_watchdog.py rename to browser_use/browser/watchdogs/storage_state_watchdog.py diff --git a/browser_use/config.py b/browser_use/config.py index 1a53426cc..4114ab93b 100644 --- a/browser_use/config.py +++ b/browser_use/config.py @@ -181,6 +181,8 @@ class FlatEnvConfig(BaseSettings): # Logging and telemetry BROWSER_USE_LOGGING_LEVEL: str = Field(default='info') CDP_LOGGING_LEVEL: str = Field(default='WARNING') + BROWSER_USE_DEBUG_LOG_FILE: str | None = Field(default=None) + BROWSER_USE_INFO_LOG_FILE: str | None = Field(default=None) ANONYMIZED_TELEMETRY: bool = Field(default=True) BROWSER_USE_CLOUD_SYNC: bool | None = Field(default=None) BROWSER_USE_CLOUD_API_URL: str = Field(default='https://api.browser-use.com') @@ -459,9 +461,7 @@ class Config: proxy_dict['server'] = env_config.BROWSER_USE_PROXY_URL if env_config.BROWSER_USE_NO_PROXY: # store bypass as comma-separated string to match Chrome flag - proxy_dict['bypass'] = ','.join( - [d.strip() for d in env_config.BROWSER_USE_NO_PROXY.split(',') if d.strip()] - ) + proxy_dict['bypass'] = ','.join([d.strip() for d in env_config.BROWSER_USE_NO_PROXY.split(',') if d.strip()]) if env_config.BROWSER_USE_PROXY_USERNAME: proxy_dict['username'] = env_config.BROWSER_USE_PROXY_USERNAME if env_config.BROWSER_USE_PROXY_PASSWORD: diff --git a/browser_use/dom/service.py b/browser_use/dom/service.py index a326708e2..9076ece75 100644 --- a/browser_use/dom/service.py +++ b/browser_use/dom/service.py @@ -28,10 +28,6 @@ if TYPE_CHECKING: from browser_use.browser.session import BrowserSession -# TODO: enable cross origin iframes -> experimental for now -ENABLE_CROSS_ORIGIN_IFRAMES = False - - class DomService: """ Service for getting the DOM tree and other DOM-related information. 
@@ -43,9 +39,12 @@ class DomService: logger: logging.Logger - def __init__(self, browser_session: 'BrowserSession', logger: logging.Logger | None = None): + def __init__( + self, browser_session: 'BrowserSession', logger: logging.Logger | None = None, cross_origin_iframes: bool = False + ): self.browser_session = browser_session self.logger = logger or browser_session.logger + self.cross_origin_iframes = cross_origin_iframes async def __aenter__(self): return self @@ -616,7 +615,7 @@ class DomService: if ( # TODO: hacky way to disable cross origin iframes for now - ENABLE_CROSS_ORIGIN_IFRAMES and node['nodeName'].upper() == 'IFRAME' and node.get('contentDocument', None) is None + self.cross_origin_iframes and node['nodeName'].upper() == 'IFRAME' and node.get('contentDocument', None) is None ): # None meaning there is no content # Use get_all_frames to find the iframe's target frame_id = node.get('frameId', None) diff --git a/browser_use/logging_config.py b/browser_use/logging_config.py index 65662b691..dc3658c5d 100644 --- a/browser_use/logging_config.py +++ b/browser_use/logging_config.py @@ -61,13 +61,15 @@ def addLoggingLevel(levelName, levelNum, methodName=None): setattr(logging, methodName, logToRoot) -def setup_logging(stream=None, log_level=None, force_setup=False): +def setup_logging(stream=None, log_level=None, force_setup=False, debug_log_file=None, info_log_file=None): """Setup logging configuration for browser-use. Args: stream: Output stream for logs (default: sys.stdout). Can be sys.stderr for MCP mode. 
log_level: Override log level (default: uses CONFIG.BROWSER_USE_LOGGING_LEVEL) force_setup: Force reconfiguration even if handlers already exist + debug_log_file: Path to a log file that receives DEBUG-level and higher logs + info_log_file: Path to a log file that receives INFO-level and higher logs """ # Try to add RESULT level, but ignore if it already exists try: @@ -94,9 +96,9 @@ def setup_logging(stream=None, log_level=None, force_setup=False): # Only clean up names in INFO mode, keep everything in DEBUG mode if self.log_level > logging.DEBUG and isinstance(record.name, str) and record.name.startswith('browser_use.'): # Extract clean component names from logger names - if 'Agent🅰' in record.name: + if 'Agent' in record.name: record.name = 'Agent' - elif 'BrowserSession🆂' in record.name: + elif 'BrowserSession' in record.name: record.name = 'BrowserSession' elif 'controller' in record.name: record.name = 'controller' @@ -125,32 +127,57 @@ def setup_logging(stream=None, log_level=None, force_setup=False): console.setLevel('RESULT') console.setFormatter(BrowserUseFormatter('%(message)s', log_level)) else: + console.setLevel(log_level) # Keep console at original log level (e.g., INFO) console.setFormatter(BrowserUseFormatter('%(levelname)-8s [%(name)s] %(message)s', log_level)) # Configure root logger only root.addHandler(console) - # Configure root logger - root.setLevel(log_level) + # Add file handlers if specified + file_handlers = [] + + # Create debug log file handler + if debug_log_file: + debug_handler = logging.FileHandler(debug_log_file) + debug_handler.setLevel(logging.DEBUG) + debug_handler.setFormatter(BrowserUseFormatter('%(asctime)s - %(levelname)-8s [%(name)s] %(message)s', logging.DEBUG)) + file_handlers.append(debug_handler) + root.addHandler(debug_handler) + + # Create info log file handler + if info_log_file: + info_handler = logging.FileHandler(info_log_file) + info_handler.setLevel(logging.INFO) + info_handler.setFormatter(BrowserUseFormatter('%(asctime)s - %(levelname)-8s
[%(name)s] %(message)s', logging.INFO)) + file_handlers.append(info_handler) + root.addHandler(info_handler) + + # Configure root logger - use DEBUG if debug file logging is enabled + effective_log_level = logging.DEBUG if debug_log_file else log_level + root.setLevel(effective_log_level) # Configure browser_use logger browser_use_logger = logging.getLogger('browser_use') browser_use_logger.propagate = False # Don't propagate to root logger browser_use_logger.addHandler(console) - browser_use_logger.setLevel(log_level) + for handler in file_handlers: + browser_use_logger.addHandler(handler) + browser_use_logger.setLevel(effective_log_level) # Configure bubus logger to allow INFO level logs bubus_logger = logging.getLogger('bubus') bubus_logger.propagate = False # Don't propagate to root logger bubus_logger.addHandler(console) - bubus_logger.setLevel(logging.INFO if log_type == 'result' else log_level) + for handler in file_handlers: + bubus_logger.addHandler(handler) + bubus_logger.setLevel(logging.INFO if log_type == 'result' else effective_log_level) # Configure CDP logging using cdp_use's setup function # This enables the formatted CDP output using CDP_LOGGING_LEVEL environment variable # Convert CDP_LOGGING_LEVEL string to logging level cdp_level_str = CONFIG.CDP_LOGGING_LEVEL.upper() cdp_level = getattr(logging, cdp_level_str, logging.WARNING) - + try: from cdp_use.logging import setup_cdp_logging # type: ignore diff --git a/browser_use/telemetry/views.py b/browser_use/telemetry/views.py index bfce41ce7..43f9ce4d9 100644 --- a/browser_use/telemetry/views.py +++ b/browser_use/telemetry/views.py @@ -3,6 +3,8 @@ from collections.abc import Sequence from dataclasses import asdict, dataclass from typing import Any +from browser_use.config import is_running_in_docker + @dataclass class BaseTelemetryEvent(ABC): @@ -13,7 +15,10 @@ class BaseTelemetryEvent(ABC): @property def properties(self) -> dict[str, Any]: - return {k: v for k, v in asdict(self).items() if k != 
'name'} + props = {k: v for k, v in asdict(self).items() if k != 'name'} + # Add Docker context if running in Docker + props['is_docker'] = is_running_in_docker() + return props @dataclass diff --git a/docs/cli.mdx b/docs/cli.mdx deleted file mode 100644 index f6a98df9a..000000000 --- a/docs/cli.mdx +++ /dev/null @@ -1,239 +0,0 @@ ---- -title: "CLI" -description: "Start using the Browser Use CLI" -icon: "terminal" ---- - -# CLI Usage - -The `browser-use` command-line interface provides multiple modes of operation for browser automation. - -## Installation - -Get started with browser-use immediately using `uvx`: - -```bash -uvx 'browser-use[cli]' --help -``` - -Or install it globally: - -```bash -uv tool install 'browser-use[cli]' -``` - -## Modes of Operation - -### 1. Interactive TUI Mode (Default) - -Launch an interactive terminal UI where you can chat with the browser automation agent: - -```bash -uvx 'browser-use[cli]' -``` - -This opens a chat interface where you can: -- Type natural language commands to control the browser -- See real-time feedback from the agent -- View browser state and actions being performed - -### 2. One-Shot Mode - -Execute a single task without entering interactive mode: - -```bash -uvx browser-use -p "Search for OpenAI documentation and take a screenshot" -``` - -Options: -- `-p, --prompt`: The task to execute -- `--headless`: Run browser in headless mode -- `--model`: Specify LLM model (default: gpt-4o) - -### 3. 
MCP Server Mode - -Run browser-use as a Model Context Protocol server: - -```bash -uvx 'browser-use[cli]' --mcp # expects MCP JSON RPC over stdio -``` - -This mode exposes browser automation capabilities as MCP tools that can be used by: -- Claude Desktop -- Other MCP-compatible clients -- Custom applications using the MCP SDK - -For MCP integration details, see: -- [MCP Server Documentation](/customize/mcp-server) -- [MCP Client Documentation](/customize/mcp-client) - -## Configuration - -Browser-use can be configured through environment variables and a configuration file. - -### Configuration File Location - -The default configuration file is located at: -- `~/.config/browseruse/config.json` - -You can override this location with: -- `BROWSER_USE_CONFIG_PATH` environment variable -- `BROWSER_USE_CONFIG_DIR` environment variable (directory containing `config.json`) - -### Configuration File Format - -The configuration uses a database-style format with UUID entries: - -```json -{ - "browser_profile": { - "550e8400-e29b-41d4-a716-446655440000": { - "id": "550e8400-e29b-41d4-a716-446655440000", - "default": true, - "created_at": "2024-01-01T00:00:00", - "headless": false, - "user_data_dir": null, - "allowed_domains": ["example.com"], - "downloads_path": "~/Downloads/browser-use" - } - }, - "llm": { - "6ba7b810-9dad-11d1-80b4-00c04fd430c8": { - "id": "6ba7b810-9dad-11d1-80b4-00c04fd430c8", - "default": true, - "created_at": "2024-01-01T00:00:00", - "api_key": "your-openai-api-key-here", - "model": "gpt-4o", - "temperature": 0.7 - } - }, - "agent": { - "6ba7b812-9dad-11d1-80b4-00c04fd430c8": { - "id": "6ba7b812-9dad-11d1-80b4-00c04fd430c8", - "default": true, - "created_at": "2024-01-01T00:00:00", - "max_steps": 100, - "use_vision": true - } - } -} -``` - -Each configuration type (browser_profile, llm, agent) can have multiple entries, with one marked as `default: true`. 
- -### Environment Variables - -Environment variables always override config.json values: - -#### General Settings -- `BROWSER_USE_LOGGING_LEVEL`: Logging level (debug, info, warning, error) -- `BROWSER_USE_CONFIG_PATH`: Full path to config.json file -- `BROWSER_USE_CONFIG_DIR`: Directory containing config.json - -#### Browser Profile Settings -- `BROWSER_USE_HEADLESS`: Run browser in headless mode (true/false) -- `BROWSER_USE_ALLOWED_DOMAINS`: Comma-separated list of allowed domains -- `BROWSER_USE_USER_DATA_DIR`: Chrome user data directory path - -#### LLM Settings -- `OPENAI_API_KEY`: OpenAI API key -- `ANTHROPIC_API_KEY`: Anthropic API key -- `BROWSER_USE_LLM_MODEL`: LLM model to use (e.g., gpt-4o, claude-3-opus) - -#### MCP-Specific Settings -When running in MCP mode, these environment variables are particularly useful: -- `BROWSER_USE_HEADLESS`: Control browser visibility -- `OPENAI_API_KEY`: Required for agent-based tools - -### Browser Profiles Directory - -Browser profiles are stored in: -``` -~/.config/browseruse/profiles/ -├── default/ # Default browser profile -├── work/ # Custom profile example -└── research/ # Another custom profile -``` - -Each profile directory contains Chrome user data, allowing you to: -- Maintain separate browser sessions -- Keep cookies and local storage isolated -- Use different extensions per profile - -## Examples - -### Basic Usage - -```bash -# Interactive mode -uvx 'browser-use[cli]' - -# One-shot task -uvx 'browser-use[cli]' -p "Go to github.com and search for browser-use" - -# Headless one-shot -uvx 'browser-use[cli]' --headless -p "Extract prices from example.com/products" -``` - -### With Configuration - -```bash -# Use specific config file -BROWSER_USE_CONFIG_PATH=~/my-config.json uvx 'browser-use[cli]' - -# Override settings via environment -BROWSER_USE_HEADLESS=true OPENAI_API_KEY=sk-... 
uvx 'browser-use[cli]' -p "Check my email" - -# Use different LLM model -BROWSER_USE_LLM_MODEL=gpt-4-turbo uvx 'browser-use[cli]' -``` - -### MCP Server Usage - -```bash -# Start MCP server -uvx 'browser-use[cli]' --mcp - -# With custom settings -BROWSER_USE_HEADLESS=false OPENAI_API_KEY=sk-... uvx 'browser-use[cli]' --mcp -``` - -For Claude Desktop integration, add to your Claude Desktop config: - -```json -{ - "mcpServers": { - "browser-use": { - "command": "uvx", - "args": ["browser-use[cli]", "--mcp"], - "env": { - "OPENAI_API_KEY": "sk-...", - "BROWSER_USE_HEADLESS": "false" - } - } - } -} -``` - -## Troubleshooting - -### Common Issues - -1. **Browser not launching**: Ensure Chrome/Chromium is installed -2. **API key errors**: Set appropriate API key environment variables -3. **Permission errors**: Check file permissions in `~/.config/browseruse/` - -### Debug Mode - -Enable debug logging for troubleshooting: - -```bash -BROWSER_USE_LOGGING_LEVEL=debug uvx 'browser-use[cli]' -``` - -## See Also - -- [Getting Started](/quickstart) -- [MCP Server Documentation](/customize/mcp-server) -- [MCP Client Documentation](/customize/mcp-client) -- [Browser Settings](/customize/browser-settings) diff --git a/docs/cloud/v1/authentication.mdx b/docs/cloud/v1/authentication.mdx index cadd7ef14..3e468bee0 100644 --- a/docs/cloud/v1/authentication.mdx +++ b/docs/cloud/v1/authentication.mdx @@ -2,6 +2,7 @@ title: "Authentication" description: "Learn how to authenticate with the Browser Use Cloud API" icon: "lock" +mode: "wide" --- The Browser Use Cloud API uses API keys to authenticate requests. You can obtain an API key from your [Browser Use Cloud dashboard](https://cloud.browser-use.com/settings/api-keys). 
diff --git a/docs/cloud/v1/custom-sdk.mdx b/docs/cloud/v1/custom-sdk.mdx index 9b9473e9e..b52a992fe 100644 --- a/docs/cloud/v1/custom-sdk.mdx +++ b/docs/cloud/v1/custom-sdk.mdx @@ -2,6 +2,7 @@ title: "Cloud SDK" description: "Learn how to set up your own Browser Use Cloud SDK" icon: "code" +mode: "wide" --- This guide walks you through setting up your own Browser Use Cloud SDK. diff --git a/docs/cloud/v1/implementation.mdx b/docs/cloud/v1/implementation.mdx index 2d80250f1..37dc68d9e 100644 --- a/docs/cloud/v1/implementation.mdx +++ b/docs/cloud/v1/implementation.mdx @@ -2,6 +2,7 @@ title: "V1 Implementation" description: "Learn how to implement the Browser Use API in Python" icon: "code" +mode: "wide" --- This guide shows how to implement common API patterns using Python. We'll create a complete example that creates and monitors a browser automation task. diff --git a/docs/cloud/v1/n8n-browser-use-integration.mdx b/docs/cloud/v1/n8n-browser-use-integration.mdx index b18641a8b..84a749f0f 100644 --- a/docs/cloud/v1/n8n-browser-use-integration.mdx +++ b/docs/cloud/v1/n8n-browser-use-integration.mdx @@ -2,6 +2,7 @@ title: "N8N + Browser Use Cloud" description: "Learn how to integrate Browser Use Cloud API with n8n using a practical workflow example (competitor research)." 
icon: "plug" +mode: "wide" --- > **TL;DR** – In **3 minutes** you can have an n8n workflow that: diff --git a/docs/cloud/v1/pricing.mdx b/docs/cloud/v1/pricing.mdx index 85eff0116..98a954672 100644 --- a/docs/cloud/v1/pricing.mdx +++ b/docs/cloud/v1/pricing.mdx @@ -2,6 +2,7 @@ title: "Pricing" description: "Browser Use Cloud API pricing structure and cost breakdown" icon: "dollar-sign" +mode: "wide" --- The Browser Use Cloud API pricing consists of two components: diff --git a/docs/cloud/v1/quickstart.mdx b/docs/cloud/v1/quickstart.mdx index 0027968a6..34129e2fc 100644 --- a/docs/cloud/v1/quickstart.mdx +++ b/docs/cloud/v1/quickstart.mdx @@ -2,6 +2,7 @@ title: "Quickstart" description: "Learn how to get started with the Browser Use Cloud API" icon: "cloud" +mode: "wide" --- diff --git a/docs/cloud/v1/webhooks.mdx b/docs/cloud/v1/webhooks.mdx index 8e97c0cfa..833233c85 100644 --- a/docs/cloud/v1/webhooks.mdx +++ b/docs/cloud/v1/webhooks.mdx @@ -2,6 +2,7 @@ title: "Webhooks" description: "Learn how to integrate webhooks with Browser Use Cloud API" icon: "code" +mode: "wide" --- Webhooks allow you to receive real-time notifications about events in your Browser Use tasks. This guide will show you how to set up and verify webhook endpoints. 
diff --git a/docs/cloud/v2/node-quickstart.mdx b/docs/cloud/v2/node-quickstart.mdx index 7d67e3aa1..13fd38c25 100644 --- a/docs/cloud/v2/node-quickstart.mdx +++ b/docs/cloud/v2/node-quickstart.mdx @@ -2,6 +2,7 @@ title: "Node.js" description: "Get started with Browser Use Cloud API using Node.js" icon: "node-js" +mode: "wide" --- Browser Use Node.js @@ -62,6 +63,7 @@ const TaskOutput = z.object({ const result = await client.tasks.run({ task: "Search for the top 10 Hacker News posts and return the title and url.", + schema: TaskOutput, }); for (const post of result.parsedOutput.posts) { @@ -85,6 +87,8 @@ const stream = browseruse.tasks.stream({ for await (const msg of stream) { switch (msg.status) { case "started": + console.log(`started: ${msg.data.session.liveUrl}`); + break; case "paused": case "stopped": console.log(`running: ${msg}`); diff --git a/docs/cloud/v2/python-quickstart.mdx b/docs/cloud/v2/python-quickstart.mdx index a23636235..2a749700e 100644 --- a/docs/cloud/v2/python-quickstart.mdx +++ b/docs/cloud/v2/python-quickstart.mdx @@ -2,6 +2,7 @@ title: "Python" description: "Get started with Browser Use Cloud API using Python" icon: "python" +mode: "wide" --- */} > To play around with the API, you can use the [Browser Use Cloud Playground](https://cloud.browser-use.com/playground). + +## Examples + +Explore quick start examples to see how to use the SDKs. + + + + Explore quick start examples for Python. + + + + Explore quick start examples for Typescript. + + + } + href="https://github.com/browser-use/browser-use-examples/tree/main/typescript/scrapper" + > + Explore quick start examples for NextJS. 
+ + diff --git a/docs/customize/agent-basic.mdx b/docs/customize/agent-basic.mdx new file mode 100644 index 000000000..1b7892cf1 --- /dev/null +++ b/docs/customize/agent-basic.mdx @@ -0,0 +1,27 @@ +--- +title: "Basics" +description: "" +icon: "play" +mode: "wide" +--- + + +```python +from browser_use import Agent, ChatOpenAI + +agent = Agent( + task="Search for latest news about AI", + llm=ChatOpenAI(model="gpt-4.1-mini"), +) + +async def main(): + history = await agent.run(max_steps=100) +``` + +- `task`: The task you want to automate. +- `llm`: Your favorite LLM. See Supported Models. + + +The agent is executed using the async `run()` method: + +- `max_steps` (default: `100`): Maximum number of steps the agent can take diff --git a/docs/customize/agent-output-format.mdx b/docs/customize/agent-output-format.mdx new file mode 100644 index 000000000..391487d5d --- /dev/null +++ b/docs/customize/agent-output-format.mdx @@ -0,0 +1,45 @@ +--- +title: "Output Format" +description: "" +icon: "arrow-right-to-bracket" +mode: "wide" +--- + +## Agent History + +The `run()` method returns an `AgentHistoryList` object with the complete execution history: + +```python +history = await agent.run() + +# Access useful information +history.urls() # List of visited URLs +history.screenshot_paths() # List of screenshot paths +history.screenshots() # List of screenshots as base64 strings +history.action_names() # Names of executed actions +history.extracted_content() # List of extracted content from all actions +history.errors() # List of errors (with None for steps without errors) +history.model_actions() # All actions with their parameters +history.model_outputs() # All model outputs from history +history.last_action() # Last action in history + +# Analysis methods +history.final_result() # Get the final extracted content (last step) +history.is_done() # Check if the agent has finished (is done) +history.is_successful() # Check if agent completed successfully (returns None if not done)
+history.has_errors() # Check if any errors occurred +history.model_thoughts() # Get the agent's reasoning process (AgentBrain objects) +history.action_results() # Get all ActionResult objects from history +history.action_history() # Get truncated action history with essential fields +history.number_of_steps() # Get the number of steps in the history +history.total_duration_seconds() # Get total duration of all steps in seconds + +# Structured output (when using output_model_schema) +history.structured_output # Property that returns parsed structured output +``` + +See all helper methods in the [AgentHistoryList source code](https://github.com/browser-use/browser-use/blob/main/browser_use/agent/views.py#L301). + +## Structured Output + +For structured output, use the `output_model_schema` parameter with a Pydantic model. [Example](https://github.com/browser-use/browser-use/blob/main/examples/features/custom_output.py). diff --git a/docs/customize/agent-parameters.mdx b/docs/customize/agent-parameters.mdx new file mode 100644 index 000000000..58b794e0d --- /dev/null +++ b/docs/customize/agent-parameters.mdx @@ -0,0 +1,50 @@ +--- +title: "All Parameters" +description: "Complete reference for all agent configuration options" +icon: "sliders" +mode: "wide" +--- + +## Available Parameters + +### Core Settings +- `controller`: Registry of [our tools](https://github.com/browser-use/browser-use/blob/main/browser_use/controller/service.py +) the agent can call. [Example for custom tools](https://github.com/browser-use/browser-use/tree/main/examples/custom-functions) +- `browser`: Browser object where you can specify the browser settings. +- `output_model_schema`: Pydantic model class for structured output validation. 
[Example](https://github.com/browser-use/browser-use/blob/main/examples/features/custom_output.py) + +### Vision & Processing +- `use_vision` (default: `True`): Enable/disable vision capabilities for processing screenshots +- `vision_detail_level` (default: `'auto'`): Screenshot detail level - `'low'`, `'high'`, or `'auto'` +- `page_extraction_llm`: Separate LLM model for page content extraction. You can choose a small & fast model because it only needs to extract text from the page (default: same as `llm`) + +### Actions & Behavior +- `initial_actions`: List of actions to run before the main task without LLM. [Example](https://github.com/browser-use/browser-use/blob/main/examples/features/initial_actions.py) +- `max_actions_per_step` (default: `10`): Maximum actions per step, e.g. for form filling the agent can output 10 fields at once. We execute the actions until the page changes. +- `max_failures` (default: `3`): Maximum retries for steps with errors +- `use_thinking` (default: `True`): Controls whether the agent uses its internal "thinking" field for explicit reasoning steps. +- `flash_mode` (default: `False`): Fast mode that skips evaluation, next goal and thinking and only uses memory. If `flash_mode` is enabled, it overrides `use_thinking` and disables the thinking process entirely. [Example](https://github.com/browser-use/browser-use/blob/main/examples/getting_started/05_fast_agent.py) + +### System Messages +- `override_system_message`: Completely replace the default system prompt. +- `extend_system_message`: Add additional instructions to the default system prompt. 
[Example](https://github.com/browser-use/browser-use/blob/main/examples/features/custom_system_prompt.py) + +### File & Data Management +- `save_conversation_path`: Path to save complete conversation history +- `save_conversation_path_encoding` (default: `'utf-8'`): Encoding for saved conversations +- `available_file_paths`: List of file paths the agent can access +- `sensitive_data`: Dictionary of sensitive data to handle carefully. [Example](https://github.com/browser-use/browser-use/blob/main/examples/features/sensitive_data.py) + +### Visual Output +- `generate_gif` (default: `False`): Generate GIF of agent actions. Set to `True` or string path +- `include_attributes`: List of HTML attributes to include in page analysis + +### Performance & Limits +- `max_history_items`: Maximum number of last steps to keep in the LLM memory. If `None`, we keep all steps. +- `llm_timeout` (default: `90`): Timeout in seconds for LLM calls +- `step_timeout` (default: `120`): Timeout in seconds for each step +- `preload` (default: `True`): If we detect a url in the task, we directly open it. + +### Advanced Options +- `calculate_cost` (default: `False`): Calculate and track API costs +- `display_files_in_done_text` (default: `True`): Show file information in completion messages diff --git a/docs/customize/agent-settings.mdx b/docs/customize/agent-settings.mdx deleted file mode 100644 index 6baa6b8d4..000000000 --- a/docs/customize/agent-settings.mdx +++ /dev/null @@ -1,201 +0,0 @@ ---- -title: "Agent Settings" -description: "Learn how to configure the agent" -icon: "gear" ---- - -## Overview - -The `Agent` class is the core component of Browser Use that handles browser automation. Here are the main configuration options you can use when initializing an agent. 
- -## Basic Settings - -```python -from browser_use import Agent, ChatOpenAI - -agent = Agent( - task="Search for latest news about AI", - llm=ChatOpenAI(model="gpt-4o"), -) -``` - -### Required Parameters - -- `task`: The instruction for the agent to execute -- `llm`: A chat model instance. See Supported Models for supported models. - -## Agent Behavior - -Control how the agent operates: - -```python -agent = Agent( - task="your task", - llm=llm, - controller=custom_controller, # For custom tool calling - use_vision=True, # Enable vision capabilities - save_conversation_path="logs/conversation" # Save chat logs -) -``` - -### Behavior Parameters - -- `controller`: Registry of functions the agent can call. Defaults to base Controller. See Custom Functions for details. -- `use_vision`: Enable/disable vision capabilities. Defaults to `True`. - - When enabled, the model processes visual information from web pages - - Disable to reduce costs or use models without vision support - - For GPT-4o, image processing costs approximately 800-1000 tokens (~$0.002 USD) per image (but this depends on the defined screen size) -- `vision_detail_level`: Controls the detail level of screenshots sent to the vision model. Can be `'low'`, `'high'`, or `'auto'` (default). Using `'low'` can significantly reduce token consumption and cost for simpler visual tasks, while `'high'` provides more detail for complex visual analysis. -- `save_conversation_path`: Path to save the complete conversation history. Useful for debugging. -- `override_system_message`: Completely replace the default system prompt with a custom one. -- `extend_system_message`: Add additional instructions to the default system prompt. - - - Vision capabilities are recommended for better web interaction understanding, - but can be disabled to reduce costs or when using models without vision - support. 
- - -### Reuse Existing Browser Context - -By default browser-use launches its own builtin browser using playwright chromium. -You can also connect to a remote browser or pass any of the following -existing playwright objects to the Agent: `page`, `browser_context`, `browser`, `browser_session`, or `browser_profile`. - -These all get passed down to create a `BrowserSession` for the `Agent`: - -```python -agent = Agent( - task='book a flight to fiji', - llm=llm, - browser_profile=browser_profile, # use this profile to create a BrowserSession - browser_session=BrowserSession( # use an existing BrowserSession - cdp_url=..., # remote CDP browser to connect to - # or - wss_url=..., # remote wss playwright server provider - # or - browser_pid=... # pid of a locally running browser process to attach to - # or - executable_path=... # provide a custom chrome binary path - # or - channel=... # specify chrome, chromium, ms-edge, etc. - # or - page=page, # use an existing playwright Page object - # or - browser_context=browser_context, # use an existing playwright BrowserContext object - # or - browser=browser, # use an existing playwright Browser object - ), -) -``` - -For example, to connect to an existing browser over CDP you could do: - -```python -agent = Agent( - ... - browser_session=BrowserSession(cdp_url='http://localhost:9222'), -) -``` - -For example, to connect to a local running chrome instance you can do: - -```python -agent = Agent( - ... - browser_session=BrowserSession(browser_pid=1234), -) -``` - -See Connect to your Browser for more info. - - - You can reuse the same `BrowserSession` after an agent has completed running. - If you do nothing, the browser will be automatically closed on `run()` - completion only if it was launched by us. - - -## Running the Agent - -The agent is executed using the async `run()` method: - -- `max_steps` (default: `100`) - Maximum number of steps the agent can take during execution. 
This prevents infinite loops and helps control execution time. - -## Agent History - -The method returns an `AgentHistoryList` object containing the complete execution history. This history is invaluable for debugging, analysis, and creating reproducible scripts. - -```python -# Example of accessing history -history = await agent.run() - -# Access (some) useful information -history.urls() # List of visited URLs -history.screenshot_paths() # List of screenshot paths -history.action_names() # Names of executed actions -history.extracted_content() # Content extracted during execution -history.errors() # Any errors that occurred -history.model_actions() # All actions with their parameters -``` - -The `AgentHistoryList` provides many helper methods to analyze the execution: - -- `final_result()`: Get the final extracted content -- `is_done()`: Check if the agent completed successfully -- `has_errors()`: Check if any errors occurred -- `model_thoughts()`: Get the agent's reasoning process -- `action_results()`: Get results of all actions - - - For a complete list of helper methods and detailed history analysis - capabilities, refer to the [AgentHistoryList source - code](https://github.com/browser-use/browser-use/blob/main/browser_use/agent/views.py#L111). - - -## Run initial actions without LLM - -With [this example](https://github.com/browser-use/browser-use/blob/main/examples/features/initial_actions.py) you can run initial actions without the LLM. -Specify the action as a dictionary where the key is the action name and the value is the action parameters. You can find all our actions in the [Controller](https://github.com/browser-use/browser-use/blob/main/browser_use/controller/service.py) source code. 
- -```python - -initial_actions = [ - {'go_to_url': {'url': 'https://www.google.com', 'new_tab': True}}, - {'go_to_url': {'url': 'https://en.wikipedia.org/wiki/Randomness', 'new_tab': True}}, - {'scroll_down': {'amount': 1000}}, -] -agent = Agent( - task='What theories are displayed on the page?', - initial_actions=initial_actions, - llm=llm, -) -``` - - - - - -### Optional Parameters - -- `initial_actions`: List of initial actions to run before the main task. -- `max_actions_per_step`: Maximum number of actions to run in a step. Defaults to `10`. -- `max_failures`: Maximum number of failures before giving up. Defaults to `3`. -- `retry_delay`: Time to wait between retries in seconds when rate limited. Defaults to `10`. -- `generate_gif`: Enable/disable GIF generation. Defaults to `False`. Set to `True` or a string path to save the GIF. - -## Memory - -Memory management in browser-use has been significantly improved since version 0.3.2. The agent's context handling and state management are now robust enough that the previous memory system (`mem0`) is no longer needed or supported. - -The agent maintains its context and task progress through: - -- Detailed history tracking of actions and results -- Structured state management -- Clear goal setting and evaluation at each step - -The `enable_memory` parameter has been removed as the new system provides better context management by default. - - - If you're upgrading from an older version that used `enable_memory`, simply remove this parameter. The agent will automatically use the improved context management system. 
- diff --git a/docs/customize/browser-basic.mdx b/docs/customize/browser-basic.mdx new file mode 100644 index 000000000..6149072bb --- /dev/null +++ b/docs/customize/browser-basic.mdx @@ -0,0 +1,27 @@ +--- +title: "Basics" +description: "" +icon: "play" +--- + + +--- + +```python +from browser_use import Agent, Browser, ChatOpenAI + +browser = Browser( + headless=False, # Show browser window + window_size={'width': 1000, 'height': 700}, # Set window size +) + +agent = Agent( + task='Search for Browser Use', + browser=browser, + llm=ChatOpenAI(model='gpt-4.1-mini'), +) + + +async def main(): + await agent.run() +``` diff --git a/docs/customize/browser-parameters.mdx b/docs/customize/browser-parameters.mdx new file mode 100644 index 000000000..3f884684d --- /dev/null +++ b/docs/customize/browser-parameters.mdx @@ -0,0 +1,120 @@ +--- +title: "All Parameters" +description: "Complete reference for all browser configuration options" +icon: "sliders" +mode: "wide" +--- + +## Core Settings +- `cdp_url`: CDP URL for connecting to existing browser instance (e.g., `"http://localhost:9222"`) +- `is_local` (default: `True`): Whether this is a local browser instance. Set to `False` for remote browsers + +## Display & Appearance +- `headless` (default: `None`): Run browser without UI. Auto-detects based on display availability (`True`/`False`/`None`) +- `window_size`: Browser window size for headful mode. Use dict `{'width': 1920, 'height': 1080}` or `ViewportSize` object +- `window_position` (default: `{'width': 0, 'height': 0}`): Window position from top-left corner in pixels +- `viewport`: Content area size, same format as `window_size`. Use `{'width': 1280, 'height': 720}` or `ViewportSize` object +- `no_viewport` (default: `None`): Disable viewport emulation, content fits to window size +- `device_scale_factor`: Device scale factor (DPI). 
Set to `2.0` or `3.0` for high-resolution screenshots +- `color_scheme` (default: `'light'`): Preferred color scheme (`'light'`, `'dark'`, `'no-preference'`) +- `contrast` (default: `'no-preference'`): High contrast mode (`'no-preference'`, `'more'`) +- `reduced_motion` (default: `'no-preference'`): Motion preference (`'reduce'`, `'no-preference'`) +- `forced_colors` (default: `'none'`): Forced colors mode (`'active'`, `'none'`) + +## Browser Behavior +- `stealth` (default: `False`): Use stealth techniques to avoid bot detection +- `keep_alive` (default: `None`): Keep browser running after agent completes +- `allowed_domains`: Restrict navigation to specific domains. Domain pattern formats: + - `'example.com'` - Matches only `https://example.com/*` + - `'*.example.com'` - Matches `https://example.com/*` and any subdomain `https://*.example.com/*` + - `'http*://example.com'` - Matches both `http://` and `https://` protocols + - `'chrome-extension://*'` - Matches any Chrome extension URL + - **Security**: Wildcards in TLD (e.g., `example.*`) are **not allowed** for security + - Use list like `['*.google.com', 'https://example.com', 'chrome-extension://*']` +- `enable_default_extensions` (default: `True`): Load automation extensions (uBlock Origin, cookie handlers, ClearURLs) +- `cross_origin_iframes` (default: `False`): Enable cross-origin iframe support (may cause complexity) + +## User Data & Profiles +- `user_data_dir` (default: auto-generated temp): Directory for browser profile data. Use `None` for incognito mode +- `profile_directory` (default: `'Default'`): Chrome profile subdirectory name (`'Profile 1'`, `'Work Profile'`, etc.) +- `storage_state`: Browser storage state (cookies, localStorage). 
Can be file path string or dict object +- `cookies_file`: **DEPRECATED** - Use `storage_state` instead + +## Network & Security +- `proxy`: Proxy configuration using `ProxySettings(server='http://host:8080', bypass='localhost,127.0.0.1', username='user', password='pass')` +- `permissions` (default: `['clipboardReadWrite', 'notifications']`): Browser permissions to grant. Use list like `['camera', 'microphone', 'geolocation']` +- `bypass_csp` (default: `False`): Bypass Content Security Policy (increases bot detection risk) +- `ignore_https_errors` (default: `False`): Ignore HTTPS certificate errors +- `extra_http_headers`: Additional HTTP headers sent with every request. Use dict like `{'Accept-Language': 'en-US', 'Custom-Header': 'value'}` +- `headers`: Additional HTTP headers for connect requests (remote browsers only) + +## Browser Launch +- `executable_path`: Path to browser executable for custom installations. Platform examples: + - macOS: `'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'` + - Windows: `'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe'` + - Linux: `'/usr/bin/google-chrome'` +- `channel`: Browser channel (`'chromium'`, `'chrome'`, `'chrome-beta'`, `'msedge'`, etc.) +- `args`: Additional command-line arguments for the browser. Use list format: `['--disable-gpu', '--custom-flag=value', '--another-flag']` +- `env`: Environment variables for browser process. Use dict like `{'DISPLAY': ':0', 'LANG': 'en_US.UTF-8', 'CUSTOM_VAR': 'test'}` +- `chromium_sandbox` (default: `True` except in Docker): Enable Chromium sandboxing for security +- `devtools` (default: `False`): Open DevTools panel automatically (requires `headless=False`) +- `ignore_default_args`: List of default args to disable, or `True` to disable all. 
Use list like `['--enable-automation', '--disable-extensions']` + +## Timing & Performance +- `slow_mo` (default: `0.0`): Slow down actions by this many milliseconds +- `timeout` (default: `30000`): Default timeout for browser operations in milliseconds +- `default_timeout`: Default timeout for playwright calls in milliseconds +- `default_navigation_timeout`: Default timeout for page navigation in milliseconds +- `minimum_wait_page_load_time` (default: `0.25`): Minimum time to wait before capturing page state in seconds +- `wait_for_network_idle_page_load_time` (default: `0.5`): Time to wait for network activity to cease in seconds +- `maximum_wait_page_load_time` (default: `5.0`): Maximum time to wait for page load in seconds +- `wait_between_actions` (default: `0.5`): Time to wait between agent actions in seconds + +## AI Integration +- `highlight_elements` (default: `True`): Highlight interactive elements for AI vision +- `viewport_expansion` (default: `500`): Viewport expansion in pixels for AI context +- `include_dynamic_attributes` (default: `True`): Include dynamic attributes in selectors for better element identification + +## Downloads & Files +- `accept_downloads` (default: `True`): Automatically accept all downloads +- `downloads_path`: Directory for downloaded files. Use string like `'./downloads'` or `Path` object +- `auto_download_pdfs` (default: `True`): Automatically download PDFs instead of viewing in browser + +## Device Emulation +- `user_agent`: Custom user agent string. 
Example: `'Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X)'` +- `is_mobile` (default: `False`): Enable mobile viewport and touch events +- `has_touch` (default: `False`): Enable touch events for mobile emulation +- `locale`: User locale like `'en-GB'`, `'de-DE'`, `'ja-JP'` +- `timezone_id`: Timezone identifier like `'America/New_York'`, `'Europe/London'`, `'UTC'` +- `screen`: Screen size information, same format as `window_size` + +## Recording & Debugging +- `record_video_dir`: Directory to save video recordings as `.webm` files +- `record_har_path`: Path to save network trace files as `.har` format +- `traces_dir`: Directory to save complete Playwright trace files for debugging +- `record_har_content` (default: `'embed'`): HAR content mode (`'omit'`, `'embed'`, `'attach'`) +- `record_har_mode` (default: `'full'`): HAR recording mode (`'full'`, `'minimal'`) + +## Advanced Options +- `disable_security` (default: `False`): ⚠️ **NOT RECOMMENDED** - Disables all browser security features +- `deterministic_rendering` (default: `False`): ⚠️ **NOT RECOMMENDED** - Forces consistent rendering but reduces performance +- `java_script_enabled` (default: `True`): Enable/disable JavaScript execution +- `offline` (default: `False`): Start browser in offline mode +- `strict_selectors` (default: `False`): Use strict selector matching +- `base_url`: Base URL for relative navigation +- `service_workers` (default: `'allow'`): Service worker policy (`'allow'`, `'block'`) + +--- + +## Outdated BrowserProfile +For backward compatibility, you can pass all the parameters from above to the `BrowserProfile` and then to the `Browser`. +```python +from browser_use import BrowserProfile +profile = BrowserProfile(headless=False, stealth=True) +browser = Browser(browser_profile=profile) +``` + +## Browser vs BrowserSession + +`Browser` is an alias for `BrowserSession` - they are exactly the same class. +Use `Browser` for cleaner, more intuitive code. 
\ No newline at end of file diff --git a/docs/customize/browser-real-browser.mdx b/docs/customize/browser-real-browser.mdx new file mode 100644 index 000000000..904ff44f9 --- /dev/null +++ b/docs/customize/browser-real-browser.mdx @@ -0,0 +1,57 @@ +--- +title: "Real Browser" +description: "" +icon: "arrow-right-to-bracket" +--- + +Connect your existing Chrome browser to preserve authentication. + +## Basic Example + +```python +from browser_use import Agent, Browser, ChatOpenAI + +# Connect to your existing Chrome browser +browser = Browser( + executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', + user_data_dir='~/Library/Application Support/Google/Chrome', + profile_directory='Default', +) + +agent = Agent( + task='Visit https://duckduckgo.com and search for "browser-use founders"', + browser=browser, + llm=ChatOpenAI(model='gpt-4.1-mini'), +) +async def main(): + await agent.run() +``` + +> **Note:** You need to fully close Chrome before running this example. + +> **Note:** Google currently blocks this approach, so we use DuckDuckGo instead. + + + +## How it Works + +1. **`executable_path`** - Path to your Chrome installation +2. **`user_data_dir`** - Your Chrome profile folder (keeps cookies, extensions, bookmarks) +3. **`profile_directory`** - Specific profile name (Default, Profile 1, etc.) 
+ + +## Platform Paths + +```python +# macOS +executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome' +user_data_dir='~/Library/Application Support/Google/Chrome' + +# Windows +executable_path='C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe' +user_data_dir='%LOCALAPPDATA%\\Google\\Chrome\\User Data' + +# Linux +executable_path='/usr/bin/google-chrome' +user_data_dir='~/.config/google-chrome' +``` \ No newline at end of file diff --git a/docs/customize/browser-remote.mdx b/docs/customize/browser-remote.mdx new file mode 100644 index 000000000..0408b4ae9 --- /dev/null +++ b/docs/customize/browser-remote.mdx @@ -0,0 +1,38 @@ +--- +title: "Remote Browser" +description: "" +icon: "cloud" +mode: "wide" +--- + + +### CDP URL Connection + +Get a CDP URL from your favorite browser provider like AnchorBrowser, HyperBrowser, BrowserBase, Steel.dev, etc.: + +```python +from browser_use import Browser + +# Connect to remote browser +browser = Browser( + cdp_url="http://remote-server:9222", + is_local=False # Important: don't try to launch local browser +) +agent = Agent(task="", browser=browser) +``` + +### Proxy Connection + +```python +from browser_use.browser.profile import ProxySettings + +browser = Browser( + cdp_url="http://remote-server:9222", + proxy=ProxySettings( + server="http://proxy-server:8080", + username="proxy-user", + password="proxy-pass" + ), + is_local=False +) +``` diff --git a/docs/customize/browser-settings.mdx b/docs/customize/browser-settings.mdx deleted file mode 100644 index dbec43905..000000000 --- a/docs/customize/browser-settings.mdx +++ /dev/null @@ -1,964 +0,0 @@ ---- -title: "Browser Settings" -description: "Launch or connect to an existing browser and configure it to your needs." 
-icon: "globe" ---- - -Browser Use uses [playwright](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-persistent-context) (or [patchright](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)) to manage its connection with a real browser. - ---- - -**To launch or connect to a browser**, pass any playwright / browser-use configuration arguments you want to `BrowserSession(...)`: - -```python -from browser_use import BrowserSession, Agent - -browser_session = BrowserSession( - headless=True, - viewport={'width': 964, 'height': 647}, - user_data_dir='~/.config/browseruse/profiles/default', -) -agent = Agent('fill out the form on this page', browser_session=browser_session) -``` - - - The new `BrowserSession` & `BrowserProfile` accept all the same arguments that - Playwright's - [`launch_persistent_context(...)`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-persistent-context) - takes, giving you full control over browser settings at launch. (see below for - the full list) - - ---- - -## `BrowserSession` - -- `BrowserSession(**params)` is Browser Use's object that tracks a connection to a running browser. It sets up: - - the `playwright`, `browser`, `browser_context`, and `page` objects and tracks which tabs the agent/human are focused on - - methods to interact with the browser window, apply config needed by the Agent, and run the `DOMService` for element detection - - it can take a `browser_profile=BrowserProfile(...)` template containing some config defaults, and `**kwargs` session-specific config overrides - -### Browser Connection Parameters - -Provide any one of these options to connect to an existing browser. These options are session-specific and cannot be stored in a `BrowserProfile(...)` template. - -#### `wss_url` - -```python -wss_url: str | None = None -``` - -WSS URL of the playwright-protocol browser server to connect to. 
See here for [WSS connection instructions](https://docs.browser-use.com/customize/real-browser#method-d%3A-connect-to-remote-playwright-node-js-browser-server-via-wss-url). - -#### `cdp_url` - -```python -cdp_url: str | None = None -``` - -CDP URL of the browser to connect to (e.g. `http://localhost:9222`). See here for [CDP connection instructions](https://docs.browser-use.com/customize/real-browser#method-e%3A-connect-to-remote-browser-via-cdp-url). - -#### `browser_pid` - -```python -browser_pid: int | None = None -``` - -PID of a running chromium-based browser process to connect to on localhost. See here for [connection via pid](https://docs.browser-use.com/customize/real-browser#method-c%3A-connect-to-local-browser-using-browser-pid) instructions. - - - For web scraping tasks on sites that restrict automated access, we recommend - using [our cloud](https://browser-use.com) or an external browser provider for - better reliability. See the [Connect to your Browser](/customize/real-browser) - guide for detailed connection instructions. - - -### Session-Specific Parameters - -#### `browser_profile` - -```python -browser_profile: BrowserProfile = BrowserProfile() -``` - -Optional `BrowserProfile` template containing default config to use for the `BrowserSession`. (see below for more info) - -#### `**kwargs` - -`BrowserSession` can also accept _all_ of the parameters [below](#browserprofile). -(the parameters _above_ this point are specific to `BrowserSession` and cannot be stored in a `BrowserProfile` template) - -Extra `**kwargs` passed to `BrowserSession(...)` act as session-specific overrides to the `BrowserProfile(...)` template. 
- -```python -base_iphone13 = BrowserProfile( - storage_state='/tmp/auth.json', # share cookies between parallel browsers - **playwright.devices['iPhone 13'], - timezone_id='UTC', -) -usa_phone = BrowserSession( - browser_profile=base_iphone13, - timezone_id='America/New_York', # kwargs override values in base_iphone13 -) -eu_phone = BrowserSession( - browser_profile=base_iphone13, - timezone_id='Europe/Paris', -) - -usa_agent = Agent(task='show me todays schedule...', browser_session=usa_phone) -eu_agent = Agent(task='show me todays schedule...', browser_session=eu_phone) -await asyncio.gather(agent1.run(), agent2.run()) -``` - ---- - -## `BrowserProfile` - -A `BrowserProfile` is a 📋 config template for a 🎭 `BrowserSession(...)`. - -It's basically just a typed + validated version of a `dict` to hold config. - -When you find yourself storing or re-using many browser configs, you can upgrade from: - -```diff -- config = {key: val, key: val, ...} -- BrowserSession(**config) -``` - -To this instead: - -```diff -+ config = BrowserProfile(key=val, key=val, ...) -+ BrowserSession(browser_profile=config) -``` - - -You don't ever *need* to use a `BrowserProfile`, you can always pass config parameters directly to `BrowserSession`: -```python -session = BrowserSession(headless=True, storage_state='auth.json', viewport={...}, ...) -``` - - -`BrowserProfile` is optional, but it provides a number of benefits over a normal `dict` for holding config: - -- has type hints and pydantic field descriptions that show up in your IDE -- validates config at runtime quickly without having to start a browser -- provides helper methods to autodetect screen size, set up local paths, save/load config as json, and more... - - -`BrowserProfiles`s are designed to easily be given 🆔 `uuid`s and put in a database + made editable by users. -`BrowserSession`s get their own 🆔 `uuid`s and be linked by 🖇 foreign key to whatever `BrowserProfiles` they use. 
- -This cleanly separates the per-connection rows from the bulky re-usable config and avoids wasting space in your db. -This is useful because a user may only have 2 or 3 profiles, but they could have 100k+ sessions within a few months. - - - -`BrowserProfile` and `BrowserSession` can both take any of the: - -- [Playwright parameters](#playwright) -- [Browser-Use parameters](#browser-use-parameters) (extra options we provide on top of `playwright`) - -The only parameters `BrowserProfile` can NOT take are the session-specific connection parameters and live playwright objects: -`cdp_url`, `wss_url`, `browser_pid`, `page`, `browser`, `browser_context`, `playwright`, etc. - -### Basic Example - -```python -from browser_use.browser import BrowserProfile - -profile = BrowserProfile( - stealth=True, - storage_state='/tmp/google_docs_cookies.json', - allowed_domains=['docs.google.com', 'https://accounts.google.com'], - viewport={'width': 396, 'height': 774}, - # ... playwright args / browser-use config args ... -) - -phone1 = BrowserSession(browser_profile=profile, device_scale_factor=1) -phone2 = BrowserSession(browser_profile=profile, device_scale_factor=2) -phone3 = BrowserSession(browser_profile=profile, device_scale_factor=3) -``` - -### Browser-Use Parameters - -These parameters control Browser Use-specific features, and are outside the standard playwright set. They can be passed to `BrowserSession(...)` and/or stored in a `BrowserProfile` template. - -#### `keep_alive` - -```python -keep_alive: bool | None = None -``` - -If `True` it wont close the browser after the first `agent.run()` ends. Useful for running multiple tasks with the same browser instance. If this is left as `None` and the Agent launched its own browser, the default is to close the browser after the agent completes. If the agent connected to an existing browser then it will leave it open. 
- -#### `stealth` - -```python -stealth: bool = False -``` - -Set to `True` to use [`patchright`](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright) to avoid bot-blocking. (Might cause issues with some sites, requires manual testing.) - - - -#### `allowed_domains` - -```python -allowed_domains: list[str] | None = None -``` - -List of allowed domains for navigation. If None, all domains are allowed. -Example: `['google.com', '*.wikipedia.org']` - Here the agent will only be able to access `google.com` exactly and `wikipedia.org` + `*.wikipedia.org`. - -Glob patterns are supported: - -- `['example.com']` ✅ will match only `https://example.com/*` exactly, subdomains will not be allowed. - It's always the most secure to list all the domains you want to give the access to explicitly w/ schemes e.g. - `['https://google.com', 'http*://www.google.com', 'https://myaccount.google.com', 'https://mail.google.com', 'https://docs.google.com']` -- `['*.example.com']` ⚠️ **CAUTION** this will match `https://example.com` and _all_ its subdomains. - Make sure _all_ the subdomains are safe for the agent! `abc.example.com`, `def.example.com`, ..., `useruploads.example.com`, `admin.example.com` - -#### `disable_security` - -```python -disable_security: bool = False -``` - - - ⚠️ Setting this to `True` is NOT RECOMMENDED. It completely disables all basic - browser security features. - - -This option is for debugging and interacting across cross-origin iFrames when there are no cookies or sensitive data in use. -It's very INSECURE, under no circumstances should you enable this while using real cookies or sensitive data, visiting a single untrusted URL in this mode can immediately compromise all the profile cookies instantly. Consider a less nuclear option like `bypass_csp=True` instead. - -#### `deterministic_rendering` - -```python -deterministic_rendering: bool = False -``` - - - ⚠️ Setting this to `True` is NOT RECOMMENDED. 
It can be glitchy & slow, and it - increases chances of getting blocked by anti-bot systems. It's mostly useful - for QA applications. - - -It's a shortcut for adding these launch args: - -- `--deterministic-mode` -- `--js-flags=--random-seed=1157259159` -- `--force-color-profile=srgb` -- `--font-render-hinting=none` -- `--force-device-scale-factor=2` -- `--enable-webgl` - -With these options fonts will look slightly worse than macOS and slightly than Windows, but rendering will be more consistent between OSs and runs. The cost is performance and stability. Software rendering is slower, easier to fingerprint as a bot, and sometimes glitchy. You likely _don't need this option_ unless you're trying to do screenshot diffing. - -#### `highlight_elements` - -```python -highlight_elements: bool = True -``` - -Highlight interactive elements on the screen with colorful bounding boxes. - -#### `viewport_expansion` - -```python -viewport_expansion: int = 500 -``` - -Viewport expansion in pixels. With this you can control how much of the page is included in the context of the LLM: - -- `-1`: All elements from the entire page will be included, regardless of visibility (highest token usage but most complete). -- `0`: Only elements which are currently visible in the viewport will be included. -- `500` (default): Elements in the viewport plus an additional 500 pixels in each direction will be included, providing a balance between context and token usage. - -#### `include_dynamic_attributes` - -```python -include_dynamic_attributes: bool = True -``` - -Include dynamic attributes in selectors for better element targeting. - -#### `minimum_wait_page_load_time` - -```python -minimum_wait_page_load_time: float = 0.25 -``` - -Minimum time to wait before capturing page state for LLM input. - -#### `wait_for_network_idle_page_load_time` - -```python -wait_for_network_idle_page_load_time: float = 0.5 -``` - -Time to wait for network activity to cease. Increase to 3-5s for slower websites. 
This tracks essential content loading, not dynamic elements like videos. - -#### `maximum_wait_page_load_time` - -```python -maximum_wait_page_load_time: float = 5.0 -``` - -Maximum time to wait for page load before proceeding. - -#### `wait_between_actions` - -```python -wait_between_actions: float = 0.5 -``` - -Time to wait between agent actions. - -#### `cookies_file` - -```python -cookies_file: str | None = None -``` - -JSON file path to save cookies to. - - -This option is DEPRECATED. Use [`storage_state`](#storage-state) instead, it's the standard playwright format and also supports `localStorage` and `indexedDB`! - -The library will automatically save a new `storage_state.json` next to any `cookies_file` path you provide, just use `storage_state='path/to/storage_state.json' to switch to the new format: - -`cookies_file.json`: `[{cookie}, {cookie}, {cookie}]` -⬇️ -`storage_state.json`: `{"cookies": [{cookie}, {cookie}, {cookie}], "origins": {... optional localstorage state ...}}` - -Or run `playwright open https://example.com/ --save-storage=storage_state.json` and log into any sites you need to generate a fresh storage state file. - - - -#### `profile_directory` - -```python -profile_directory: str = 'Default' -``` - -Chrome profile subdirectory name inside of your `user_data_dir` (e.g. `Default`, `Profile 1`, `Work`, etc.). -No need to set this unless you have multiple profiles set up in a single `user_data_dir` and need to use a specific one. - -#### `window_position` - -```python -window_position: dict | None = {"width": 0, "height": 0} -``` - -Window position from top-left corner. - -#### `save_recording_path` - -```python -save_recording_path: str | None = None -``` - -Directory path for saving video recordings. - -#### `trace_path` - -```python -trace_path: str | None = None -``` - -Directory path for saving Agent trace files. Files are automatically named as `{trace_path}/{context_id}.zip`. 
- ---- - - - - -### Playwright Launch Options - -All the parameters below are standard playwright parameters and can be passed to both `BrowserSession` and `BrowserProfile`. -They are defined in `browser_use/browser/profile.py`. See here for the [official Playwright documentation](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-persistent-context) for all of these options. - -#### `headless` - -```python -headless: bool | None = None -``` - -Runs the browser without a visible UI. If None, auto-detects based on display availability. If you set `headless=False` on a server with no monitor attached, the browser will fail to launch (use `xvfb` + vnc to give a headless server a virtual display you can remote control). - -`headless=False` is recommended for maximum stealth and is required for human-in-the-loop workflows. - -#### `channel` - -```python -channel: BrowserChannel = 'chromium' -``` - -Browser channel: `['chromium']` (default when `stealth=False`), `'chrome'` (default when `stealth=True`), `'chrome-beta'`, `'chrome-dev'`, `'chrome-canary'`, `'msedge'`, `'msedge-beta'`, `'msedge-dev'`, `'msedge-canary'` - -Don't worry, other chromium-based browsers not in this list (e.g. `brave`) are still supported if you provide your own [`executable_path`](#executable_path), just set it to `chromium` for those. - -#### `executable_path` - -```python -executable_path: str | Path | None = None -``` - -Path to browser executable for custom installations. - -#### `user_data_dir` - -```python -user_data_dir: str | Path | None = '~/.config/browseruse/profiles/default' -``` - -Directory for browser profile data. Set to `None` to use an ephemeral temporary profile (aka incognito mode). - -Multiple running browsers **cannot share a single `user_data_dir` at the same time**. You must set it to `None` or -provide a unique `user_data_dir` per-session if you plan to run multiple browsers. 
- -The browser version run must always be equal to or greater than the version used to create the `user_data_dir`. -If you see errors like `Failed to parse Extensions` or similar and failures when launching, you're attempting to run an older browser with an incompatible `user_data_dir` that's already been migrated to a newer schema version. - -#### `args` - -```python -args: list[str] = [] -``` - -Additional command-line arguments to pass to the browser. See here for the [full list of available chrome launch options](https://peter.sh/experiments/chromium-command-line-switches/). - -#### `ignore_default_args` - -```python -ignore_default_args: list[str] | bool = ['--enable-automation', '--disable-extensions'] -``` - -List of default CLI args to stop playwright from including when launching chrome. Set it to `True` to disable _all_ default options (not recommended). - -#### `env` - -```python -env: dict[str, str] = {} -``` - -Extra environment variables to set when launching browser. e.g. `{'DISPLAY': '1'}` to use a specific X11 display. - -#### `chromium_sandbox` - -```python -chromium_sandbox: bool = not IN_DOCKER -``` - -Whether to enable Chromium sandboxing (recommended for security). Should always be `False` when running inside Docker -because Docker provides its own sandboxing can conflict with Chrome's. - -#### `devtools` - -```python -devtools: bool = False -``` - -Whether to open DevTools panel automatically (only works when `headless=False`). - -#### `slow_mo` - -```python -slow_mo: float = 0 -``` - -Slow down actions by this many milliseconds. - -#### `timeout` - -```python -timeout: float = 30000 -``` - -Default timeout in milliseconds for connecting to a remote browser. - -#### `accept_downloads` - -```python -accept_downloads: bool = True -``` - -Whether to automatically accept all downloads. - -#### `proxy` - -```python -proxy: ProxySettings | None = None -``` - -Proxy settings (typed). 
Example: - -```python -proxy=ProxySettings(server="http://proxy.com:8080", username="user", password="pass") -``` - -#### `permissions` - -```python -permissions: list[str] = ['clipboard-read', 'clipboard-write', 'notifications'] -``` - -Browser permissions to grant. See here for the [full list of available permission](https://playwright.dev/python/docs/api/class-browsercontext#browser-context-grant-permissions). - -#### `storage_state` - -```python -storage_state: str | Path | dict | None = None -``` - -Browser storage state (cookies, localStorage). Can be file path or dict. See here for the [Playwright `storage_state` documentation](https://playwright.dev/python/docs/api/class-browsercontext#browser-context-storage-state) on how to use it. -This option is only applied when launching a new browser using the default builtin playwright chromium and `user_data_dir=None` is set. - -```bash -# to create a storage state file, run the following and log into the sites you need once the browser opens: -playwright open https://example.com/ --save-storage=./storage_state.json -# then setup a BrowserSession with storage_state='./storage_state.json' and user_data_dir=None to use it -``` - -### Playwright Timing Settings - -These control how the browser waits for CDP API calls to complete and pages to load. - -#### `default_timeout` - -```python -default_timeout: float | None = None -``` - -Default timeout for Playwright operations in milliseconds (e.g. `10000` if you want 10s). - -#### `default_navigation_timeout` - -```python -default_navigation_timeout: float | None = None -``` - -Default timeout for page navigation in milliseconds (e.g. `30000` if you want 30s). - -### Playwright Viewport Options - -Configure browser window size, viewport, and display properties: - -#### `user_agent` - -```python -user_agent: str | None = None -``` - -Specific user agent to use in this context. See [`playwright.devices`](https://playwright.dev/python/docs/emulation). 
- -#### `is_mobile` - -```python -is_mobile: bool = False -``` - -Whether the meta viewport tag is taken into account and touch events are enabled. - -#### `has_touch` - -```python -has_touch: bool = False -``` - -Specifies if viewport supports touch events. - -#### `geolocation` - -```python -geolocation: dict | None = None -``` - -Geolocation coordinates. Example: `{"latitude": 59.95, "longitude": 30.31667}` - -#### `locale` - -```python -locale: str | None = None -``` - -Specify user locale, for example `en-GB`, `de-DE`, etc. Locale will affect the `navigator.language` value, `Accept-Language` request header value as well as number and date formatting rules. - -#### `timezone_id` - -```python -timezone_id: str | None = None -``` - -Timezone identifier (e.g. `'America/New_York'` or `'UTC'`). - -#### `window_size` - -```python -window_size: dict | None = None -``` - -Browser window size for headful mode. Example: `{"width": 1920, "height": 1080}` - -#### `viewport` - -```python -viewport: dict | None = None -``` - -Viewport size with `width` and `height`. Example: `{"width": 1280, "height": 720}` - -#### `no_viewport` - -```python -no_viewport: bool | None = not headless -``` - -Disable fixed viewport. Content will resize with window. - -_Tip:_ don't use this parameter, it's a playwright standard parameter but it's redundant and only serves to override the `viewport` setting above. -A viewport is _always_ used in headless mode regardless of this setting, and is _never_ used in headful mode unless you pass `viewport={width, height}` explicitly. - -#### `device_scale_factor` - -```python -device_scale_factor: float | None = None -``` - -Device scale factor (DPI). Useful for high-resolution screenshots (set it to 2 or 3). - -#### `screen` - -```python -screen: dict | None = None -``` - -Screen size available to browser. Auto-detected if not specified. 
- -#### `color_scheme` - -```python -color_scheme: ColorScheme = 'light' -``` - -Preferred color scheme: `'light'`, `'dark'`, `'no-preference'` - -#### `contrast` - -```python -contrast: Contrast = 'no-preference' -``` - -Contrast preference: `'no-preference'`, `'more'`, `'null'` - -#### `reduced_motion` - -```python -reduced_motion: ReducedMotion = 'no-preference' -``` - -Reduced motion preference: `'reduce'`, `'no-preference'`, `'null'` - -#### `forced_colors` - -```python -forced_colors: ForcedColors = 'none' -``` - -Forced colors mode: `'active'`, `'none'`, `'null'` - -#### `**playwright.devices[...]` - -Playwright provides launch & context arg presets to [emulate common device fingerprints](https://playwright.dev/python/docs/emulation). - -```python -BrowserProfile( - ... - **playwright.devices['iPhone 13'], # playwright = await async_playwright().start() -) -``` - -Because `BrowserSession` and `BrowserProfile` take all the standard playwright args, we are able to support these device presets as well. - -### Playwright Security Options - -> See `allowed_domains` above too! - -#### `offline` - -```python -offline: bool = False -``` - -Emulate network being offline. - -#### `http_credentials` - -```python -http_credentials: dict | None = None -``` - -Credentials for HTTP authentication. - -#### `extra_http_headers` - -```python -extra_http_headers: dict[str, str] = {} -``` - -Additional HTTP headers to be sent with every request. - -#### `ignore_https_errors` - -```python -ignore_https_errors: bool = False -``` - -Whether to ignore HTTPS errors when sending network requests. - -#### `bypass_csp` - -```python -bypass_csp: bool = False -``` - - - Enabling this can increase security risk and makes the bot very easy to - fingerprint. (Cloudflare, Datadome, etc. will block you) - - -Toggles bypassing Content-Security-Policy. 
Enabling reduces some CSP-related errors that can arise from automation scripts injected into pages with strict policies that forbid inline scripts. - -#### `java_script_enabled` - -```python -java_script_enabled: bool = True -``` - - - Not recommended, untested with Browser Use and likely breaks things. - - -Whether or not to enable JavaScript in the context. - -#### `service_workers` - -```python -service_workers: ServiceWorkers = 'allow' -``` - -Whether to allow sites to register Service workers: `'allow'`, `'block'` - -#### `base_url` - -```python -base_url: str | None = None -``` - -Base URL to be used in `page.goto()` and similar operations. - -#### `strict_selectors` - -```python -strict_selectors: bool = False -``` - -If true, selector passed to Playwright methods will throw if more than one element matches. - -#### `client_certificates` - -```python -client_certificates: list[ClientCertificate] = [] -``` - -Client certificates to be used with requests. - -### Playwright Recording Options - -Note: Browser Use also provides some of our own recording-related options not listed below (see above). - -#### `record_video_dir` - - - - -```python -record_video_dir: str | Path | None = None -``` - -Directory to save `.webm` video recordings. [Playwright Docs: `record_video_dir`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-persistent-context-option-record-video-dir) - - - This parameter also has an alias `save_recording_path` for backwards - compatibility with past versions, but we recommend using the standard - Playwright name `record_video_dir` going forward. - - -#### `record_video_size` - -```python -record_video_size: dict | None = None. [Playwright Docs: `record_video_size`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-persistent-context-option-record-video-size) - -``` - -Video size. 
Example: `{"width": 1280, "height": 720}` - -#### `record_har_path` - - - - -```python -record_har_path: str | Path | None = None -``` - -Path to save `.har` network trace files. [Playwright Docs: `record_har_path`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-persistent-context-option-record-har-path) - - - This parameter also has an alias `save_har_path` for backwards compatibility - with past versions, but we recommend using the standard Playwright name - `record_har_path` going forward. - - -#### `record_har_content` - -```python -record_har_content: RecordHarContent = 'embed' -``` - -How to persist HAR content: `'omit'`, `'embed'`, `'attach'` - -#### `record_har_mode` - -```python -record_har_mode: RecordHarMode = 'full' -``` - -HAR recording mode: `'full'`, `'minimal'` - -#### `record_har_omit_content` - -```python -record_har_omit_content: bool = False -``` - -Whether to omit request content from the HAR. - -#### `record_har_url_filter` - -```python -record_har_url_filter: str | Pattern | None = None -``` - -URL filter for HAR recording. - -#### `downloads_path` - -```python -downloads_path: str | Path | None = '~/.config/browseruse/downloads' -``` - -(aliases: `downloads_dir`, `save_downloads_path`) - -Local filesystem directory to save browser file downloads to. - -#### `traces_dir` - - - - -```python -traces_dir: str | Path | None = None -``` - -Directory to save all-in-one trace files. Files are automatically named as `{traces_dir}/{context_id}.zip`. [Playwright Docs: `traces_dir`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-persistent-context-option-traces-dir) - - - This parameter also has an alias `trace_path` for backwards compatibility with - past versions, but we recommend using the standard Playwright name - `traces_dir` going forward. - - -#### `handle_sighup` - -```python -handle_sighup: bool = True -``` - -Whether playwright should swallow SIGHUP signals and kill the browser. 
- -#### `handle_sigint` - -```python -handle_sigint: bool = False -``` - -Whether playwright should swallow SIGINT signals and kill the browser. - -#### `handle_sigterm` - -```python -handle_sigterm: bool = False -``` - -Whether playwright should swallow SIGTERM signals and kill the browser. - ---- - -## Full Example - -```python -from browser_use import BrowserSession, BrowserProfile, Agent - -browser_profile = BrowserProfile( - headless=False, - storage_state="path/to/storage_state.json", - wait_for_network_idle_page_load_time=3.0, - viewport={"width": 1280, "height": 1100}, - locale='en-US', - user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36', - highlight_elements=True, - viewport_expansion=500, - allowed_domains=['*.google.com', 'http*://*.wikipedia.org'], - user_data_dir=None, -) - -browser_session = BrowserSession( - browser_profile=browser_profile, - headless=True, # extra kwargs to the session override the defaults in the profile -) - -# you can drive a session without the agent / reuse it between agents -await browser_session.start() -page = await browser_session.get_current_page() -await page.goto('https://example.com/first/page') - -async def run_search(): - agent = Agent( - task='Your task', - llm=llm, - page=page, # optional: pass a specific playwright page to start on - browser_session=browser_session, # optional: pass an existing browser session to an agent - ) -``` - ---- - -## Summary - -- **BrowserSession** (defined in `browser_use/browser/session.py`) handles the live browser connection and runtime state -- **BrowserProfile** (defined in `browser_use/browser/profile.py`) is a template that can store default config parameters for a `BrowserSession(...)` - -Configuration parameters defined in both scopes consumed by these calls depending on whether we're connecting/launching: - -- `BrowserConnectArgs` - args for `playwright.BrowserType.connect_over_cdp(...)` -- 
`BrowserLaunchArgs` - args for `playwright.BrowserType.launch(...)` -- `BrowserNewContextArgs` - args for `playwright.BrowserType.new_context(...)` -- `BrowserLaunchPersistentContextArgs` - args for `playwright.BrowserType.launch_persistent_context(...)` -- Browser Use's own internal methods - -For more details on Playwright's browser context options, see their [launch args documentation](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-persistent-context). - ---- diff --git a/docs/customize/chain-agents.mdx b/docs/customize/chain-agents.mdx new file mode 100644 index 000000000..ff5e3f6a6 --- /dev/null +++ b/docs/customize/chain-agents.mdx @@ -0,0 +1,45 @@ +--- +title: "Chain Agents" +description: "Chain multiple tasks together with the same agent and browser session." +icon: "link" +mode: "wide" +--- + +## Chain Agent Tasks + +Keep your browser session alive and chain multiple tasks together. Perfect for conversational workflows or multi-step processes. + +```python +import asyncio +from dotenv import load_dotenv +load_dotenv() + +from browser_use import Agent, BrowserProfile + +profile = BrowserProfile(keep_alive=True) + +async def main(): + agent = Agent(task="Go to reddit.com", browser_profile=profile) + await agent.run(max_steps=1) + + while True: + user_response = input('\n👤 New task or "q" to quit: ') + if user_response.lower() == 'q': + break + agent.add_new_task(f'New task: {user_response}') + await agent.run() + +if __name__ == '__main__': + asyncio.run(main()) +``` + +## How It Works + +1. **Persistent Browser**: `BrowserProfile(keep_alive=True)` prevents browser from closing between tasks +2. **Task Chaining**: Use `agent.add_new_task()` to add follow-up tasks +3. **Context Preservation**: Agent maintains memory and browser state across tasks +4. 
**Interactive Flow**: Perfect for conversational interfaces or complex workflows + + +The browser session remains active throughout the entire chain, preserving all cookies, local storage, and page state. + \ No newline at end of file diff --git a/docs/customize/custom-functions.mdx b/docs/customize/custom-functions.mdx index 33abbb483..850da3bfa 100644 --- a/docs/customize/custom-functions.mdx +++ b/docs/customize/custom-functions.mdx @@ -1,7 +1,8 @@ --- -title: "Custom Functions" +title: "Tools" description: "Extend default agent and write custom action functions to do certain tasks" -icon: "function" +icon: "wrench" +mode: "wide" --- Custom actions are functions *you* provide, that are added to our [default actions](https://github.com/browser-use/browser-use/blob/main/browser_use/controller/service.py) the agent can use to accomplish tasks. @@ -59,10 +60,18 @@ When the LLM calls an action, it sees its argument names & types, and will provi ```python @controller.action('Click element') -def click_element(css_selector: str, page: Page) -> ActionResult: +async def click_element(css_selector: str, browser_session: Browser) -> ActionResult: # css_selector is an action param the LLM must provide when calling - # page is a special framework-provided param to access the browser APIs (see below) - await page.locator(css_selector).click() + # browser_session is a special framework-provided param to access the browser APIs (see below) + + # Get the current CDP session to interact with the browser + cdp_session = await browser_session.get_or_create_cdp_session() + + # Use CDP to evaluate JavaScript and click the element + await cdp_session.cdp_client.send.Runtime.evaluate( + params={'expression': f'document.querySelector("{css_selector}").click()'}, + session_id=cdp_session.session_id, + ) return ActionResult(extracted_content=f"Clicked element {css_selector}") ``` @@ -89,12 +98,27 @@ class MyParams(BaseModel): field4: str = Field(default='abc', description='Detailed 
description for the LLM') @controller.action('My action', param_model=MyParams) -def my_action(params: MyParams, page: Page) -> ActionResult: - await page.keyboard.type(params.field2) - return ActionResult(extracted_content=f"Inputted {params} on {page.url}") +async def my_action(params: MyParams, browser_session: Browser) -> ActionResult: + # Get the current CDP session to interact with the browser + cdp_session = await browser_session.get_or_create_cdp_session() + + # Use CDP to type text + await cdp_session.cdp_client.send.Input.insertText( + params={'text': params.field2}, + session_id=cdp_session.session_id, + ) + + # Get current URL using CDP + result = await cdp_session.cdp_client.send.Runtime.evaluate( + params={'expression': 'window.location.href', 'returnByValue': True}, + session_id=cdp_session.session_id, + ) + current_url = result.get('result', {}).get('value', 'unknown') + + return ActionResult(extracted_content=f"Inputted {params} on {current_url}") ``` -Any special framework-provided arguments (e.g. `page`) will be passed as separate positional arguments after `params`. +Any special framework-provided arguments (e.g. `browser_session`) will be passed as separate positional arguments after `params`. To use a `BaseModel` the arg *must* be called `params`. Action function args are matched and filled like named arguments; arg order doesn't matter but names and types do. @@ -104,47 +128,134 @@ To use a `BaseModel` the arg *must* be called `params`. Action function args are These special action parameters are injected by the `Controller` and are passed as extra args to any actions that expect them. -For example, actions that need to run playwright code to interact with the browser should take the argument `page` or `browser_session`. +For example, actions that need to interact with the browser should take the `browser_session` argument. 
-- `page: Page` - The current Playwright page (shortcut for `browser_session.get_current_page()`) -- `browser_session: BrowserSession` - The current browser session (and playwright context via `browser_session.browser_context`) +- `browser_session: Browser` - The current browser session with access to CDP for browser interaction - `context: AgentContext` - Any optional top-level context object passed to the Agent, e.g. `Agent(context=user_provided_obj)` - `page_extraction_llm: BaseChatModel` - LLM instance used for page content extraction - `available_file_paths: list[str]` - List of available file paths for upload / processing - `has_sensitive_data: bool` - Whether the action content contains sensitive data markers (check this to avoid logging sensitive data to terminal by accident) -#### Example: Action uses the current `page` + +Browser Use has moved from Playwright to Chrome DevTools Protocol (CDP) for browser interaction. The `browser_session` provides access to CDP through `browser_session.agent_focus.cdp_client` or `await browser_session.get_or_create_cdp_session()`. Playwright is only used internally to install the browser binary, but all browser interaction is done via CDP. + + +### Understanding the Browser Session Context + +The `Browser` object provides multiple ways to interact with the browser: + +#### 1. 
Direct CDP Access +```python +# Get the current CDP session +cdp_session = await browser_session.get_or_create_cdp_session() + +# Execute JavaScript +result = await cdp_session.cdp_client.send.Runtime.evaluate( + params={'expression': 'document.title', 'returnByValue': True}, + session_id=cdp_session.session_id, +) + +# Click at coordinates +await cdp_session.cdp_client.send.Input.dispatchMouseEvent( + params={ + 'type': 'mousePressed', + 'x': 100, + 'y': 200, + 'button': 'left', + 'clickCount': 1, + }, + session_id=cdp_session.session_id, +) +await cdp_session.cdp_client.send.Input.dispatchMouseEvent( + params={ + 'type': 'mouseReleased', + 'x': 100, + 'y': 200, + 'button': 'left', + }, + session_id=cdp_session.session_id, +) +``` + +#### 2. Event-Based Actions +```python +from browser_use.browser.events import ClickElementEvent, TypeTextEvent, NavigateToUrlEvent + +# Get a DOM element first +element = await browser_session.get_dom_element_by_index(5) + +# Dispatch events through the event bus +click_event = browser_session.event_bus.dispatch(ClickElementEvent(node=element)) +await click_event + +type_event = browser_session.event_bus.dispatch(TypeTextEvent(node=element, text="Hello")) +await type_event + +navigate_event = browser_session.event_bus.dispatch(NavigateToUrlEvent(url="https://example.com")) +await navigate_event +``` + +#### 3. 
High-Level Browser Session Methods +```python +# Get current page information +state = await browser_session.get_browser_state_summary() +print(f"Current URL: {state.url}") +print(f"Page title: {state.title}") + +# Take a screenshot +screenshot_path = await browser_session.take_screenshot() + +# Get page HTML +html = await browser_session.get_page_html() + +# Get all open tabs +tabs = await browser_session.get_tabs() +``` + +#### Example: Action uses the current browser session ```python -from browser_use.browser.types import Page -from browser_use import Controller, ActionResult +from browser_use import Browser, Controller, ActionResult controller = Controller() @controller.action('Type keyboard input into a page') -async def input_text_into_page(text: str, page: Page) -> ActionResult: - await page.keyboard.type(text) - return ActionResult(extracted_content='Website opened') +async def input_text_into_page(text: str, browser_session: Browser) -> ActionResult: + # Get the current CDP session to interact with the browser + cdp_session = await browser_session.get_or_create_cdp_session() + + # Use CDP to type text + await cdp_session.cdp_client.send.Input.insertText( + params={'text': text}, + session_id=cdp_session.session_id, + ) + return ActionResult(extracted_content='Text input completed') ``` -#### Example: Action uses the `browser_context` +#### Example: Action uses browser session for tab management ```python -from browser_use import BrowserSession, Controller, ActionResult +from browser_use import Browser, Controller, ActionResult +from browser_use.browser.events import NavigateToUrlEvent, SwitchTabEvent controller = Controller() @controller.action('Open website') -async def open_website(url: str, browser_session: BrowserSession) -> ActionResult: - # find matching existing tab by looking through all pages in playwright browser_context - all_tabs = await browser_session.browser_context.pages - for tab in all_tabs: +async def open_website(url: str, 
browser_session: Browser) -> ActionResult: + # Get all open tabs + tabs = await browser_session.get_tabs() + + # Check if URL is already open in any tab + for tab in tabs: if tab.url == url: - await tab.bring_to_foreground() - return ActionResult(extracted_content=f'Switched to tab with url {url}') - # otherwise, create a new tab - new_tab = await browser_session.browser_context.new_page() - await new_tab.goto(url) + # Switch to existing tab using events + switch_event = browser_session.event_bus.dispatch(SwitchTabEvent(target_id=tab.target_id)) + await switch_event + return ActionResult(extracted_content=f'Switched to existing tab with url {url}') + + # Otherwise, open URL in a new tab using events + navigate_event = browser_session.event_bus.dispatch(NavigateToUrlEvent(url=url, new_tab=True)) + await navigate_event return ActionResult(extracted_content=f'Opened new tab with url {url}') ``` @@ -155,15 +266,15 @@ async def open_website(url: str, browser_session: BrowserSession) -> ActionResul ## Important Rules 1. **Return an [`ActionResult`](https://github.com/search?q=repo%3Abrowser-use%2Fbrowser-use+%22class+ActionResult%28BaseModel%29%22&type=code)**: All actions should return an `ActionResult | str | None`. The stringified version of the result is passed back to the LLM, and optionally persisted in the long-term memory when `ActionResult(..., include_in_memory=True)`. -2. **Type hints on arguments are required**: They are used to verify that action params don't conflict with special arguments injected by the controller (e.g. `page`) +2. **Type hints on arguments are required**: They are used to verify that action params don't conflict with special arguments injected by the controller (e.g. `browser_session`) 3. 
**Actions functions called directly must be passed kwargs**: When calling actions from other actions or python code, you must **pass all parameters as kwargs only**, even though the actions are usually defined using positional args (for the same reasons as [pluggy](https://pluggy.readthedocs.io/en/stable/index.html#calling-hooks)). Action arguments are always matched by name and type, **not** positional order, so this helps prevent ambiguity / reordering issues while keeping action signatures short. ```python @controller.action('Fill in the country form field') - def input_country_field(country: str, page: Page) -> ActionResult: - await some_action(123, page=page) # ❌ not allowed: positional args, use kwarg syntax when calling - await some_action(abc=123, page=page) # ✅ allowed: action params & special kwargs - await some_other_action(params=OtherAction(abc=123), page=page) # ✅ allowed: params=model & special kwargs + async def input_country_field(country: str, browser_session: Browser) -> ActionResult: + await some_action(123, browser_session=browser_session) # ❌ not allowed: positional args, use kwarg syntax when calling + await some_action(abc=123, browser_session=browser_session) # ✅ allowed: action params & special kwargs + await some_other_action(params=OtherAction(abc=123), browser_session=browser_session) # ✅ allowed: params=model & special kwargs ``` ```python @@ -173,12 +284,12 @@ class PinCodeParams(BaseModel): retries: int = 3 # ✅ supports optional/defaults @controller.action('...', param_model=PinCodeParams) -async def input_pin_code(params: PinCodeParams, page: Page): ... # ✅ special params at the end +async def input_pin_code(params: PinCodeParams, browser_session: Browser): ... # ✅ special params at the end # Using function arguments to define action params -async def input_pin_code(code: int, retries: int, page: Page): ... 
# ✅ params first, special params second, no defaults +async def input_pin_code(code: int, retries: int, browser_session: Browser): ... # ✅ params first, special params second, no defaults async def input_pin_code(code: int, retries: int=3): ... # ✅ defaults ok only if no special params needed -async def input_pin_code(code: int, retries: int=3, page: Page): ... # ❌ Python SyntaxError! not allowed +async def input_pin_code(code: int, retries: int=3, browser_session: Browser): ... # ❌ Python SyntaxError! not allowed ``` @@ -228,23 +339,8 @@ agent = Agent(controller=controller, ...) ``` -If you want actions to only be available on certain pages, and to not tell the LLM about them on other pages, - you can use the `allowed_domains` and `page_filter`: -```python -from pydantic import BaseModel -from browser_use import Controller, ActionResult - -controller = Controller() - -async def is_ai_allowed(page: Page): - if api.some_service.check_url(page.url): - logger.warning('Allowing AI agent to visit url:', page.url) - return True - return False - -@controller.action('Fill out secret_form', allowed_domains=['https://*.example.com'], page_filter=is_ai_allowed) +@controller.action('Fill out secret_form', allowed_domains=['https://*.example.com']) def fill_out_form(...) -> ActionResult: - ... will only be runnable by LLM on pages that match https://*.example.com *AND* where is_ai_allowed(page) returns True - + ... will only be runnable by LLM on pages that match https://*.example.com ``` diff --git a/docs/customize/fast-agent.mdx b/docs/customize/fast-agent.mdx new file mode 100644 index 000000000..ef1946915 --- /dev/null +++ b/docs/customize/fast-agent.mdx @@ -0,0 +1,97 @@ +--- +title: "Fast Agent" +description: "Optimize agent performance for maximum speed and efficiency." 
+icon: "bolt" +mode: "wide" +--- + +```python +import asyncio +from dotenv import load_dotenv +load_dotenv() + +from browser_use import Agent, BrowserProfile + +# Speed optimization instructions for the model +SPEED_OPTIMIZATION_PROMPT = """ +Speed optimization instructions: +- Be extremely concise and direct in your responses +- Get to the goal as quickly as possible +- Use multi-action sequences whenever possible to reduce steps +""" + + +async def main(): + # 1. Use fast LLM - Llama 4 on Groq for ultra-fast inference + from browser_use import ChatGroq + + llm = ChatGroq( + model='meta-llama/llama-4-maverick-17b-128e-instruct', + temperature=0.0, + ) + # from browser_use import ChatGoogle + + # llm = ChatGoogle(model='gemini-2.5-flash') + + # 2. Create speed-optimized browser profile + browser_profile = BrowserProfile( + minimum_wait_page_load_time=0.1, + wait_between_actions=0.1, + headless=False, + ) + + # 3. Define a speed-focused task + task = """ + 1. Go to reddit https://www.reddit.com/search/?q=browser+agent&type=communities + 2. Click directly on the first 5 communities to open each in new tabs + 3. Find out what the latest post is about, and switch directly to the next tab + 4. Return the latest post summary for each page + """ + + # 4. Create agent with all speed optimizations + agent = Agent( + task=task, + llm=llm, + flash_mode=True, # Disables thinking in the LLM output for maximum speed + browser_profile=browser_profile, + extend_system_message=SPEED_OPTIMIZATION_PROMPT, + ) + + await agent.run() + + +if __name__ == '__main__': + asyncio.run(main()) +``` + +## Speed Optimization Techniques + +### 1. Fast LLM Models +```python +# Groq - Ultra-fast inference +from browser_use import ChatGroq +llm = ChatGroq(model='meta-llama/llama-4-maverick-17b-128e-instruct') + +# Google Gemini Flash - Optimized for speed +from browser_use import ChatGoogle +llm = ChatGoogle(model='gemini-2.5-flash') +``` + +### 2. 
Browser Optimizations +```python +browser_profile = BrowserProfile( + minimum_wait_page_load_time=0.1, # Reduce wait time + wait_between_actions=0.1, # Faster action execution + headless=True, # No GUI overhead +) +``` + +### 3. Agent Optimizations +```python +agent = Agent( + task=task, + llm=llm, + flash_mode=True, # Skip LLM thinking process + extend_system_message=SPEED_OPTIMIZATION_PROMPT, # Optimize LLM behavior +) +``` diff --git a/docs/customize/hooks.mdx b/docs/customize/hooks.mdx index b3091a050..ede04c056 100644 --- a/docs/customize/hooks.mdx +++ b/docs/customize/hooks.mdx @@ -3,6 +3,7 @@ title: "Lifecycle Hooks" description: "Customize agent behavior with lifecycle hooks" icon: "Wrench" author: "Carlos A. Planchón" +mode: "wide" --- Browser-Use provides lifecycle hooks that allow you to execute custom code at specific points during the agent's execution. @@ -35,11 +36,10 @@ async def my_step_hook(agent: Agent): # agent.controller, agent.llm, agent.browser_session # agent.pause(), agent.resume(), agent.add_new_task(...), etc. - # You also have direct access to the playwright Page and Browser Context - page = await agent.browser_session.get_current_page() - # https://playwright.dev/python/docs/api/class-page - - current_url = page.url + # You also have direct access to the browser state + state = await agent.browser_session.get_browser_state_summary() + + current_url = state.url visit_log = agent.history.urls() previous_url = visit_log[-2] if len(visit_log) >= 2 else None print(f"Agent was last on URL: {previous_url} and is now on {current_url}") @@ -68,7 +68,7 @@ async def my_step_hook(agent: Agent): agent = Agent( task="Search for the latest news about AI", - llm=ChatOpenAI(model="gpt-4o"), + llm=ChatOpenAI(model="gpt-4.1-mini"), ) await agent.run( @@ -96,10 +96,10 @@ When working with agent hooks, you have access to the entire `Agent` instance.
H - `agent.history.model_actions()`: Actions taken by the agent - `agent.history.extracted_content()`: Content extracted from web pages - `agent.history.urls()`: URLs visited by the agent -- `agent.browser_session` gives direct access to the `BrowserSession()` and playwright objects - - `agent.browser_session.get_current_page()`: Get the current playwright `Page` object the agent is focused on - - `agent.browser_session.browser_context`: Get the current playwright `BrowserContext` object - - `agent.browser_session.browser_context.pages`: Get all the tabs currently open in the context +- `agent.browser_session` gives direct access to the `Browser()` and CDP interface + - `agent.browser_session.agent_focus`: Get the current CDP session the agent is focused on + - `agent.browser_session.get_or_create_cdp_session()`: Get the current CDP session for browser interaction + - `agent.browser_session.get_tabs()`: Get all tabs currently open - `agent.browser_session.get_page_html()`: Current page HTML - `agent.browser_session.take_screenshot()`: Screenshot of the current page @@ -337,7 +337,7 @@ async def run_agent(): """Run the Browser-Use agent with the recording hook""" agent = Agent( task="Compare the price of gpt-4o and DeepSeek-V3", - llm=ChatOpenAI(model="gpt-4o"), + llm=ChatOpenAI(model="gpt-4.1-mini"), ) try: diff --git a/docs/customize/mcp-client.mdx b/docs/customize/mcp-client.mdx index 09ab71af2..976ec1df7 100644 --- a/docs/customize/mcp-client.mdx +++ b/docs/customize/mcp-client.mdx @@ -2,6 +2,7 @@ title: "MCP Client" description: "Connect external MCP servers to extend browser-use with additional tools and integrations" icon: "plug" +mode: "wide" --- The MCP (Model Context Protocol) client allows browser-use agents to connect to external MCP servers, automatically exposing their tools as actions. 
diff --git a/docs/customize/mcp-server.mdx b/docs/customize/mcp-server.mdx index ebfe9c9bf..5bcdbd5a0 100644 --- a/docs/customize/mcp-server.mdx +++ b/docs/customize/mcp-server.mdx @@ -2,6 +2,7 @@ title: "MCP Server" description: "Expose browser-use capabilities as an MCP server for AI assistants like Claude Desktop" icon: "server" +mode: "wide" --- The MCP server exposes browser-use's browser automation capabilities as tools that can be used by AI assistants like Claude Desktop. This allows external MCP clients to control browsers, navigate websites, extract content, and perform automated tasks. @@ -17,7 +18,7 @@ The MCP server acts as a bridge between MCP-compatible AI assistants and browser ```mermaid graph LR A[Claude Desktop] -->|MCP Protocol| B[Browser-use MCP Server] - B --> C[BrowserSession] + B --> C[Browser] B --> D[Controller] B --> E[FileSystem] C --> F[Playwright Browser] @@ -379,7 +380,7 @@ uvx 'browser-use[cli]' --mcp playwright install chromium # Test browser launch -python -c "from browser_use import BrowserSession; import asyncio; asyncio.run(BrowserSession().start())" +python -c "from browser_use import Browser; import asyncio; asyncio.run(Browser().start())" ``` ### Connection Errors diff --git a/docs/customize/more-examples.mdx b/docs/customize/more-examples.mdx new file mode 100644 index 000000000..5a039ed74 --- /dev/null +++ b/docs/customize/more-examples.mdx @@ -0,0 +1,54 @@ +--- +title: "More Examples" +description: "Explore additional examples and use cases on GitHub." +icon: "arrow-up-right-from-square" +mode: "wide" +--- + +## Additional Examples + +Explore our comprehensive collection of examples on GitHub for more advanced use cases and integrations. 
+ +### 📁 Featured Examples + +**🔒 [Secure Setup](https://github.com/browser-use/browser-use/blob/main/examples/features/secure.py)** +Azure OpenAI with enterprise security and data privacy + +**🎯 [Custom Functions](https://github.com/browser-use/browser-use/tree/main/examples/custom-functions)** +2FA integration, file uploads, notifications, and more + +**🏪 [E-commerce](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/shopping.py)** +Automated shopping and product comparison + +**💼 [Job Applications](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/find_and_apply_to_jobs.py)** +CV upload and job application automation + +### 🔗 Browse All Examples + +**[View Complete Examples Directory →](https://github.com/browser-use/browser-use/tree/main/examples)** + +Categories available: +- **Getting Started** - Basic examples for beginners +- **Features** - Advanced functionality demonstrations +- **Custom Functions** - Extend agent capabilities +- **Integrations** - Gmail, Slack, Discord, MCP servers +- **Models** - Different LLM provider examples +- **Use Cases** - Real-world application scenarios +- **Browser** - Browser configuration examples +- **UI** - Gradio and Streamlit interfaces + +### 🤝 Contributing Examples + +Have a great use case? **[Submit a pull request](https://github.com/browser-use/browser-use/pulls)** with your example! + +**What makes a good example:** +- Clear documentation and comments +- Real-world use case +- Follows project conventions +- Includes error handling + +### 📞 Need Help? 
+ +- **[GitHub Issues](https://github.com/browser-use/browser-use/issues)** - Bug reports and feature requests +- **[Discord Community](https://link.browser-use.com/discord)** - Live support and discussions +- **Enterprise Support** - [support@browser-use.com](mailto:support@browser-use.com) diff --git a/docs/customize/output-format.mdx b/docs/customize/output-format.mdx deleted file mode 100644 index b48a88836..000000000 --- a/docs/customize/output-format.mdx +++ /dev/null @@ -1,50 +0,0 @@ ---- -title: "Output Format" -description: "The default is text. But you can define a structured output format to make post-processing easier." -icon: "code" ---- - -## Custom output format -With [this example](https://github.com/browser-use/browser-use/blob/main/examples/features/custom_output.py) you can define what output format the agent should return to you. - -```python -from pydantic import BaseModel -# Define the output format as a Pydantic model -class Post(BaseModel): - post_title: str - post_url: str - num_comments: int - hours_since_post: int - - -class Posts(BaseModel): - posts: List[Post] - - -controller = Controller(output_model=Posts) - - -async def main(): - task = 'Go to hackernews show hn and give me the first 5 posts' - model = ChatOpenAI(model='gpt-4o') - agent = Agent(task=task, llm=model, controller=controller) - - history = await agent.run() - - result = history.final_result() - if result: - parsed: Posts = Posts.model_validate_json(result) - - for post in parsed.posts: - print('\n--------------------------------') - print(f'Title: {post.post_title}') - print(f'URL: {post.post_url}') - print(f'Comments: {post.num_comments}') - print(f'Hours since post: {post.hours_since_post}') - else: - print('No result') - - -if __name__ == '__main__': - asyncio.run(main()) -``` diff --git a/docs/customize/parallel-browser.mdx b/docs/customize/parallel-browser.mdx new file mode 100644 index 000000000..5a6a3b296 --- /dev/null +++ b/docs/customize/parallel-browser.mdx @@ 
-0,0 +1,47 @@ +--- +title: "Parallel Agents" +description: "Run multiple agents in parallel with separate browser instances" +icon: "copy" +--- + +```python +import asyncio +from browser_use import Agent, Browser, ChatOpenAI + +async def main(): + # Create 3 separate browser instances + browsers = [ + Browser( + user_data_dir=f'./temp-profile-{i}', + headless=False, + ) + for i in range(3) + ] + + # Create 3 agents with different tasks + agents = [ + Agent( + task='Search for "browser automation" on Google', + browser=browsers[0], + llm=ChatOpenAI(model='gpt-4.1-mini'), + ), + Agent( + task='Search for "AI agents" on DuckDuckGo', + browser=browsers[1], + llm=ChatOpenAI(model='gpt-4.1-mini'), + ), + Agent( + task='Visit Wikipedia and search for "web scraping"', + browser=browsers[2], + llm=ChatOpenAI(model='gpt-4.1-mini'), + ), + ] + + # Run all agents in parallel + tasks = [agent.run() for agent in agents] + results = await asyncio.gather(*tasks, return_exceptions=True) + + print('🎉 All agents completed!') +``` + +> **Note:** This is experimental, and agents might conflict with each other. diff --git a/docs/customize/real-browser.mdx b/docs/customize/real-browser.mdx index 2f6c38455..7573eb186 100644 --- a/docs/customize/real-browser.mdx +++ b/docs/customize/real-browser.mdx @@ -2,6 +2,7 @@ title: "Connect to your Browser" description: "Connect to a remote browser or launch a new local browser." 
icon: "computer" +mode: "wide" --- ## Overview @@ -10,7 +11,6 @@ Browser Use supports a wide variety of ways to launch or connect to a browser: - Launch a new local browser using playwright/patchright chromium (the default) - Connect to a remote browser using CDP or WSS -- Use an existing playwright `Page`, `Browser`, or `BrowserContext` object - Connect to a local browser already running using `browser_pid` @@ -24,13 +24,13 @@ We provide automatic CAPTCHA solving, proxies, human-in-the-loop automation, and ### Method A: Launch a New Local Browser (Default) -Launch a local browser using built-in default (playwright `chromium`) or a provided `executable_path`: +Launch a local browser using built-in default (Playwright-installed `chromium`) or a provided `executable_path`: ```python -from browser_use import Agent, BrowserSession +from browser_use import Agent, Browser # If no executable_path provided, uses Playwright/Patchright's built-in Chromium -browser_session = BrowserSession( +browser = Browser( # Path to a specific Chromium-based executable (optional) executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', # macOS # For Windows: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe' @@ -45,7 +45,7 @@ browser_session = BrowserSession( agent = Agent( task="Your task here", llm=llm, - browser_session=browser_session, + browser=browser, ) ``` @@ -63,41 +63,42 @@ We support most `chromium`-based browsers in `executable_path`, including [Brave persist over time. 
-### Method B: Connect Using Existing Playwright Objects +### Method B: Connect to Remote Browser via CDP -Pass existing Playwright `Page`, `BrowserContext`, `Browser`, and/or `playwright` API object to `BrowserSession(...)`: +Connect to a remote browser instance using Chrome DevTools Protocol: ```python -from browser_use import Agent, BrowserSession -from playwright.async_api import async_playwright -# from patchright.async_api import async_playwright # stealth alternative +from browser_use import Agent, Browser -async with async_playwright() as playwright: - browser = await playwright.chromium.launch() - context = await browser.new_context() - page = await context.new_page() +# Connect to a remote browser (e.g., running in Docker, cloud, or another machine) +browser = Browser( + cdp_url="ws://remote-browser:9222/devtools/browser", # Remote CDP WebSocket URL + is_local=False, # Important: set to False for remote connections +) - browser_session = BrowserSession( - page=page, - # browser_context=context, # all these are supported - # browser=browser, - # playwright=playwright, - ) - - agent = Agent( - task="Your task here", - llm=llm, - browser_session=browser_session, - ) -``` - -You can also pass `page` directly to `Agent(...)` as a shortcut. - -```python agent = Agent( task="Your task here", llm=llm, - page=page, + browser=browser, +) +``` + + +Playwright Page/Browser/Context objects are no longer supported. Browser Use now uses CDP exclusively for all browser interactions. 
+ + +You can also use HTTP-based CDP connections: + +```python +browser = Browser( + cdp_url="http://remote-browser:9222", # Remote CDP HTTP URL + is_local=False, +) + +agent = Agent( + task="Your task here", + llm=llm, + browser=browser, ) ``` @@ -106,18 +107,18 @@ agent = Agent( Connect to a browser with open `--remote-debugging-port`: ```python -from browser_use import Agent, BrowserSession +from browser_use import Agent, Browser # First, start Chrome with remote debugging: # /Applications/Google Chrome.app/Contents/MacOS/Google Chrome --remote-debugging-port=9242 # Then connect using the process ID -browser_session = BrowserSession(browser_pid=12345) # Replace with actual Chrome PID +browser = Browser(browser_pid=12345) # Replace with actual Chrome PID agent = Agent( task="Your task here", llm=llm, - browser_session=browser_session, + browser=browser, ) ``` @@ -126,15 +127,15 @@ agent = Agent( Connect to Playwright Node.js server providers: ```python -from browser_use import Agent, BrowserSession +from browser_use import Agent, Browser # Connect to a playwright server -browser_session = BrowserSession(wss_url="wss://your-playwright-server.com/ws") +browser = Browser(wss_url="wss://your-playwright-server.com/ws") agent = Agent( task="Your task here", llm=llm, - browser_session=browser_session, + browser=browser, ) ``` @@ -143,15 +144,15 @@ agent = Agent( Connect to any remote Chromium-based browser: ```python -from browser_use import Agent, BrowserSession +from browser_use import Agent, Browser # Connect to Chrome via CDP -browser_session = BrowserSession(cdp_url="http://localhost:9222") +browser = Browser(cdp_url="http://localhost:9222") agent = Agent( task="Your task here", llm=llm, - browser_session=browser_session, + browser=browser, ) ``` @@ -165,7 +166,7 @@ agent = Agent( - Extensions and their data Always review the task you're giving to the agent and ensure it aligns with your security requirements! 
- Use `Agent(sensitive_data={'https://auth.example.com': {x_key: value}})` for any secrets, and restrict the browser with `BrowserSession(allowed_domains=['https://*.example.com'])`. + Use `Agent(sensitive_data={'https://auth.example.com': {x_key: value}})` for any secrets, and restrict the browser with `Browser(allowed_domains=['https://*.example.com'])`. ## Best Practices @@ -173,7 +174,7 @@ agent = Agent( 1. **Use isolated profiles**: Create separate Chrome profiles for different agents to limit scope of risk: ```python - browser_session = BrowserSession( + browser = Browser( user_data_dir='~/.config/browseruse/profiles/banking', # profile_directory='Default' ) @@ -182,40 +183,40 @@ agent = Agent( 2. **Limit domain access**: Restrict which sites the agent can visit: ```python - browser_session = BrowserSession( + browser = Browser( allowed_domains=['example.com', 'http*://*.github.com'], ) ``` -3. **Enable `keep_alive=True`** If you want to use a single `BrowserSession` with more than one agent: +3. **Enable `keep_alive=True`** If you want to use a single `Browser` with more than one agent: ```python - browser_session = BrowserSession( + browser = Browser( keep_alive=True, ... ) - await browser_session.start() # start the session yourself before passing to Agent + await browser.start() # start the session yourself before passing to Agent ... - agent = Agent(..., browser_session=browser_session) + agent = Agent(..., browser=browser) await agent.run() ... - await browser_session.kill() # end the session yourself, shortcut for keep_alive=False + .stop() + await browser.kill() # end the session yourself, shortcut for keep_alive=False + .stop() ``` ## Re-Using a Browser -A `BrowserSession` starts when the browser is launched/connected, and ends when the browser process exits/disconnects. A session internally manages a single live playwright browser context, and is normally auto-closed by the agent when its task is complete (_if_ the agent started the session itself). 
If you pass an existing `BrowserSession` into an Agent, or if you set `BrowserSession(keep_alive=True)`, the session will not be closed and can be re-used between agents. +A `Browser` starts when the browser is launched/connected, and ends when the browser process exits/disconnects. A session internally manages a single live playwright browser context, and is normally auto-closed by the agent when its task is complete (_if_ the agent started the session itself). If you pass an existing `Browser` into an Agent, or if you set `Browser(keep_alive=True)`, the session will not be closed and can be re-used between agents. Browser Use provides a number of ways to re-use profiles, sessions, and other configuration across multiple agents. -- ✅ sequential agents can re-use a single `user_data_dir` in new `BrowserSession`s -- ✅ sequential agents can re-use a single `BrowserSession` without closing it -- ❌ parallel agents cannot run separate `BrowserSession`s using the same `user_data_dir` -- ✅ parallel agents can run separate `BrowserSession`s using the same `storage_state` -- ✅ parallel agents can share a single `BrowserSession`, working in different tabs -- ⚠️ parallel agents can share a single `BrowserSession`, working in the same tab +- ✅ sequential agents can re-use a single `user_data_dir` in new `Browser`s +- ✅ sequential agents can re-use a single `Browser` without closing it +- ❌ parallel agents cannot run separate `Browser`s using the same `user_data_dir` +- ✅ parallel agents can run separate `Browser`s using the same `storage_state` +- ✅ parallel agents can share a single `Browser`, working in different tabs +- ⚠️ parallel agents can share a single `Browser`, working in the same tab - Multiple `BrowserSession`s (aka chrome processes) cannot share the same + Multiple `Browser`s (aka chrome processes) cannot share the same `user_data_dir` at the same time, but they can share a `storage_state` file or `BrowserProfile` config. 
@@ -225,21 +226,21 @@ Browser Use provides a number of ways to re-use profiles, sessions, and other co If you are only running one agent & browser at a time, they can re-use the same `user_data_dir` sequentially. ```python -from browser_use import Agent, BrowserSession +from browser_use import Agent, Browser from browser_use.llm import ChatOpenAI reused_profile = BrowserProfile(user_data_dir='~/.config/browseruse/profiles/default') agent1 = Agent( task="The first task...", - llm=ChatOpenAI(model="gpt-4o-mini"), + llm=ChatOpenAI(model="gpt-4.1-mini"), browser_profile=reused_profile, # pass the profile in, it will auto-create a session ) await agent1.run() agent2 = Agent( task="The second task...", - llm=ChatOpenAI(model="gpt-4o-mini"), + llm=ChatOpenAI(model="gpt-4.1-mini"), browser_profile=reused_profile, # agent will auto-create its own new session ) await agent2.run() @@ -249,14 +250,14 @@ await agent2.run() ### Sequential Agents, Same Profile, Same Browser -If you are only running one agent at a time, they can re-use the same active `BrowserSession` and avoid having to relaunch chrome. +If you are only running one agent at a time, they can re-use the same active `Browser` and avoid having to relaunch chrome. Each agent will start off looking at the same tab the last agent ended off on. 
```python -from browser_use import Agent, BrowserSession +from browser_use import Agent, Browser from browser_use.llm import ChatOpenAI -reused_session = BrowserSession( +reused_session = Browser( user_data_dir='~/.config/browseruse/profiles/default', keep_alive=True, # dont close browser after 1st agent.run() ends ) @@ -264,15 +265,15 @@ await reused_session.start() # when keep_alive=True, session must be started m agent1 = Agent( task="The first task...", - llm=ChatOpenAI(model="gpt-4o-mini"), - browser_session=reused_session, + llm=ChatOpenAI(model="gpt-4.1-mini"), + browser=reused_session, ) await agent1.run() agent2 = Agent( task="The second task...", - llm=ChatOpenAI(model="gpt-4o-mini"), - browser_session=reused_session, # re-use the same session + llm=ChatOpenAI(model="gpt-4.1-mini"), + browser=reused_session, # re-use the same session ) await agent2.run() @@ -282,26 +283,40 @@ await reused_session.close() ### Parallel Agents, Same Browser, Multiple Tabs ```python -from browser_use import Agent, BrowserSession +import asyncio +from browser_use import Agent, Browser from browser_use.llm import ChatOpenAI -from playwright.async_api import async_playwright +from browser_use.browser.events import NavigateToUrlEvent -async with async_playwright() as playwright: - browser_context = await playwright.chromium.launch_persistent_context() - page1 = await browser_context.new_page() - page2 = await browser_context.new_page() +# Create a shared browser session +browser = Browser() +await browser.start() - agent1 = Agent( - task="The first task...", - llm=ChatOpenAI(model="gpt-4o-mini"), - page=page1, - ) - agent2 = Agent( - task="The second task...", - llm=ChatOpenAI(model="gpt-4o-mini"), - page=page2, - ) - await asyncio.gather(agent1.run(), agent2.run()) # run in parallel +# Create tabs for each agent using events +tab1_event = browser.event_bus.dispatch(NavigateToUrlEvent(url="about:blank", new_tab=True)) +await tab1_event + +tab2_event = 
browser.event_bus.dispatch(NavigateToUrlEvent(url="about:blank", new_tab=True)) +await tab2_event + +# Get tab information +tabs = await browser.get_tabs() + +# Create agents that will work with different tabs +agent1 = Agent( + task="The first task...", + llm=ChatOpenAI(model="gpt-4.1-mini"), + browser=browser, +) + +agent2 = Agent( + task="The second task...", + llm=ChatOpenAI(model="gpt-4.1-mini"), + browser=browser, +) + +# Run agents in parallel (they will automatically coordinate tab switching) +await asyncio.gather(agent1.run(), agent2.run()) ``` ### Parallel Agents, Same Browser, Same Tab @@ -313,32 +328,34 @@ async with async_playwright() as playwright: ```python -from browser_use import Agent, BrowserSession +import asyncio +from browser_use import Agent, Browser from browser_use.llm import ChatOpenAI -from playwright.async_api import async_playwright +from browser_use.browser.events import NavigateToUrlEvent -playwright = await async_playwright().start() -browser = await playwright.chromium.launch(headless=True) -context = await browser.new_context() -shared_page = await context.new_page() -await shared_page.goto('https://example.com', wait_until='load') - -shared_session = BrowserSession(page=shared_page, keep_alive=True) +# Create a shared browser session +shared_session = Browser() await shared_session.start() +# Navigate to the target page +navigate_event = shared_session.event_bus.dispatch(NavigateToUrlEvent(url='https://example.com')) +await navigate_event + agent1 = Agent( task="Fill out the form in section A...", - llm=ChatOpenAI(model="gpt-4o-mini"), - browser_session=shared_session + llm=ChatOpenAI(model="gpt-4.1-mini"), + browser=shared_session ) agent2 = Agent( task="Fill out the form in section B...", - llm=ChatOpenAI(model="gpt-4o-mini"), - browser_session=shared_session, + llm=ChatOpenAI(model="gpt-4.1-mini"), + browser=shared_session, ) -await asyncio.gather(agent1.run(), agent2.run()) # run in parallel -await shared_session.kill() +# Run 
agents in parallel on the same tab (not recommended) +await asyncio.gather(agent1.run(), agent2.run()) + +await shared_session.stop() ``` ### Parallel Agents, Same Profile, Different Browsers @@ -356,7 +373,7 @@ playwright open https://example.com/ --load-storage=/tmp/auth.json ``` ```python -from browser_use.browser import BrowserProfile, BrowserSession +from browser_use.browser import BrowserProfile, Browser shared_profile = BrowserProfile( headless=True, @@ -365,13 +382,13 @@ shared_profile = BrowserProfile( keep_alive=True, # don't close the browser after the agent finishes ) -window1 = BrowserSession(browser_profile=shared_profile) +window1 = Browser(browser_profile=shared_profile) await window1.start() -agent1 = Agent(browser_session=window1) +agent1 = Agent(browser=window1) -window2 = BrowserSession(browser_profile=shared_profile) +window2 = Browser(browser_profile=shared_profile) await window2.start() -agent2 = Agent(browser_session=window2) +agent2 = Agent(browser=window2) await asyncio.gather(agent1.run(), agent2.run()) # run in parallel await window1.save_storage_state() # write storage state (cookies, localStorage, etc.) to auth.json @@ -404,7 +421,7 @@ If you're having trouble connecting: If you get a "profile is already in use" error: 1. Close all Chrome instances -2. The profile will automatically be unlocked when BrowserSession starts +2. The profile will automatically be unlocked when Browser starts 3. Alternatively, manually delete the `SingletonLock` file in the profile directory diff --git a/docs/customize/secure.mdx b/docs/customize/secure.mdx new file mode 100644 index 000000000..8a8af634e --- /dev/null +++ b/docs/customize/secure.mdx @@ -0,0 +1,65 @@ +--- +title: "Secure Setup" +description: "Azure OpenAI with data privacy and security configuration." +icon: "shield-check" +mode: "wide" +--- + +## Secure Setup with Azure OpenAI + +Enterprise-grade security with Azure OpenAI, data privacy protection, and restricted browser access. 
+ +```python +import asyncio +import os +from dotenv import load_dotenv +load_dotenv() +os.environ['ANONYMIZED_TELEMETRY'] = 'false' +from browser_use import Agent, BrowserProfile, ChatAzureOpenAI + +# Azure OpenAI configuration +api_key = os.getenv('AZURE_OPENAI_KEY') +azure_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT') +llm = ChatAzureOpenAI(model='gpt-4.1-mini', api_key=api_key, azure_endpoint=azure_endpoint) + +# Secure browser configuration +browser_profile = BrowserProfile( + allowed_domains=['*google.com', 'browser-use.com'], + enable_default_extensions=False +) + +# Sensitive data filtering +sensitive_data = {'company_name': 'browser-use'} + +# Create secure agent +agent = Agent( + task='Find the founders of the sensitive company_name', + llm=llm, + browser_profile=browser_profile, + sensitive_data=sensitive_data +) + +async def main(): + await agent.run(max_steps=10) + +asyncio.run(main()) +``` + +## Security Features + +**Azure OpenAI:** +- NOT used to train OpenAI models +- NOT shared with other customers +- Hosted entirely within Azure +- 30-day retention (or zero with Limited Access Program) + +**Browser Security:** +- `allowed_domains`: Restrict navigation to trusted sites +- `enable_default_extensions=False`: Disable potentially dangerous extensions +- `sensitive_data`: Filter sensitive information from LLM input + + + + +For enterprise deployments contact support@browser-use.com. + diff --git a/docs/customize/sensitive-data.mdx b/docs/customize/sensitive-data.mdx index c536f8bd6..aeef62529 100644 --- a/docs/customize/sensitive-data.mdx +++ b/docs/customize/sensitive-data.mdx @@ -2,195 +2,35 @@ title: "Sensitive Data" description: "Handle sensitive information securely and avoid sending PII & passwords to the LLM." 
icon: "shield" +mode: "wide" --- -## Handling Sensitive Data - -When working with sensitive information like passwords or PII, you can use the `Agent(sensitive_data=...)` parameter to provide sensitive strings that the model can use in actions without ever seeing directly. ```python +import os +from browser_use import Agent, Browser, ChatOpenAI +os.environ['ANONYMIZED_TELEMETRY'] = "false" + agent = Agent( - task='Log into example.com as user x_username with password x_password', + task='Log into example.com with username x_user and password x_pass', sensitive_data={ 'https://example.com': { - 'x_username': 'abc@example.com', - 'x_password': 'abc123456', # 'x_placeholder': '', + 'x_user': 'your-real-username@email.com', + 'x_pass': 'your-real-password123', }, }, + use_vision=False, # Disable vision to prevent LLM seeing sensitive data in screenshots + llm=ChatOpenAI(model='gpt-4.1-mini'), ) -``` - - - -You should also configure [`BrowserSession(allowed_domains=...)`](https://docs.browser-use.com/customize/browser-settings#allowed-domains) to prevent the Agent from visiting URLs not needed for the task. - - - -### Basic Usage - -Here's a basic example of how to use sensitive data: - -```python -from dotenv import load_dotenv -load_dotenv() - -from browser_use.llm import ChatOpenAI -from browser_use import Agent, BrowserSession - -llm = ChatOpenAI(model='gpt-4.1') - -# Define sensitive data -# The LLM will only see placeholder names (x_member_number, x_passphrase), never the actual values -sensitive_data = { - 'https://*.example.com': { - 'x_member_number': '123235325', - 'x_passphrase': 'abcwe234', - }, -} - -# Use the placeholder names in your task description -task = """ -1. go to https://travel.example.com -2. sign in with your member number x_member_number and private access code x_passphrase -3. 
extract today's list of travel deals as JSON -""" - -# Recommended: Limit the domains available for the entire browser so the Agent can't be tricked into visiting untrusted URLs -browser_session = BrowserSession(allowed_domains=['https://*.example.com']) - -agent = Agent( - task=task, - llm=llm, - sensitive_data=sensitive_data, # Pass the sensitive data to the agent - browser_session=browser_session, # Pass the restricted browser_session to limit URLs Agent can visit - use_vision=False, # Disable vision or else the LLM might see entered values in screenshots -) - async def main(): - await agent.run() - -if __name__ == '__main__': - asyncio.run(main()) +    await agent.run() ``` -In this example: +## How it Works +1. **Text Filtering**: The LLM only sees placeholders (`x_user`, `x_pass`); we filter your sensitive data from the input text. +2. **DOM Actions**: Real values are injected directly into form fields after the LLM call. -1. The LLM only ever sees the `x_member_number` and `x_passphrase` placeholders in prompts -2. When the model wants to use your password it outputs x_passphrase - and we replace it with the actual value in the DOM -3. When sensitive data appear in the content of the current page, we replace it in the page summary fed to the LLM - so that the model never has it in its state. -4. The browser will be entirely prevented from going to any site not under `https://*.example.com` - -This approach ensures that sensitive information remains secure while still allowing the agent to perform tasks that require authentication. - ---- - -### Best Practices - -- Always restrict your sensitive data to only the exact domains that need it, `https://travel.example.com` is better than `*.example.com` -- Always restrict [`BrowserSession(allowed_domains=[...])`](https://docs.browser-use.com/customize/browser-settings#allowed-domains) to only the domains the agent needs to visit to accomplish its task. 
This helps guard against prompt injection attacks, jailbreaks, and LLM mistakes. -- Only use `sensitive_data` for strings that can be inputted verbatim as text. The LLM never sees the actual values, so it can't "understand" them, adapt them, or split them up for multiple input fields. For example, you can't ask the Agent to click through a datepicker UI to input the sensitive value `1990-12-31`. For these situations you can implement a [custom function](/customize/custom-functions) the LLM can call that updates the DOM using Python / JS. -- Don't use `sensitive_data` for login credentials, it's better to use [`storage_state`](docs.browser-use.com/customize/browser-settings#storage-state) or a [`user_data_dir`](/customize/browser-settings#user-data-dir) to log into the sites the agent needs in advance & reuse the cookies: - -```bash -# open a browser to log into the sites you need & save the cookies -$ playwright open https://accounts.google.com --save-storage auth.json -``` - -Then use those cookies when the agent runs: - -```python -agent = Agent(..., browser_session=BrowserSession(storage_state='./auth.json')) -``` - - - -Warning: Vision models still see the screenshot of the page by default - where the sensitive data might be visible. - -It's recommended to set `Agent(use_vision=False)` when working with `sensitive_data`. - - - - - - -### Allowed Domains - -Domain patterns in `sensitive_data` follow the same format as [`allowed_domains`](https://docs.browser-use.com/customize/browser-settings#allowed-domains): - -- `example.com` - Matches only `https://example.com/*` -- `*.example.com` - Matches `https://example.com/*` and any subdomain `https://*.example.com/*` -- `http*://example.com` - Matches both `http://` and `https://` protocols for `example.com/*` -- `chrome-extension://*` - Matches any Chrome extension URL e.g. 
`chrome-extension://anyextensionid/options.html` - -> **Security Warning**: For security reasons, certain patterns are explicitly rejected: -> -> - Wildcards in TLD part (e.g., `example.*`) are **not allowed** (`google.*` would match `google.ninja`, `google.pizza`, etc. which is a bad idea) -> - Embedded wildcards (e.g., `g*e.com`) are rejected to prevent overly broad matches -> - Multiple wildcards like `*.*.domain` are not supported currently, open an issue if you need this feature - -The default protocol when no scheme is specified is now `https://` for enhanced security. - -For convenience the system will validate that all domain patterns used in `Agent(sensitive_data)` are also included in `BrowserSession(allowed_domains)`. - -### Missing or Empty Values - -When working with sensitive data, keep these details in mind: - -- If a key referenced by the model (`key_name`) is missing from your `sensitive_data` dictionary, a warning will be logged but the substitution tag will be preserved. -- If you provide an empty value for a key in the `sensitive_data` dictionary, it will be treated the same as a missing key. -- The system will always attempt to process all valid substitutions, even if some keys are missing or empty. - ---- - -### Full Example - -Here's a more complex example demonstrating multiple domains and sensitive data values. 
- -```python -from dotenv import load_dotenv -load_dotenv() - -from browser_use.llm import ChatOpenAI -from browser_use import Agent, BrowserSession - - -llm = ChatOpenAI(model='gpt-4.1') - -# Domain-specific sensitive data -sensitive_data = { - 'https://*.google.com': {'x_email': '...', 'x_pass': '...'}, - 'chrome-extension://abcd1243': {'x_api_key': '...'}, - 'http*://example.com': {'x_authcode': '123123'} -} - -# Set browser session with allowed domains that match all domain patterns in sensitive_data -browser_session = BrowserSession( - allowed_domains=[ - 'https://*.google.com', - 'chrome-extension://abcd', - 'http://example.com', # Explicitly include http:// if needed - 'https://example.com' # By default, only https:// is matched - ] -) - -# Pass the sensitive data to the agent -agent = Agent( - task="Log into Google, then check my account information", - llm=llm, - sensitive_data=sensitive_data, - browser_session=browser_session, - use_vision=False, -) - -async def main(): - await agent.run() - -if __name__ == '__main__': - asyncio.run(main()) -``` - -With this approach: - -1. The Google credentials (`x_email` and `x_pass`) will only be used on Google domains (any subdomain, https only) -2. The API key (`x_api_key`) will only be used on pages served by the specific Chrome extension `abcd1243` -3. 
The auth code (`x_authcode`) will only be used on `http://example.com/*` or `https://example.com/*` +## Best Practices +- Use `Browser(allowed_domains=[...])` to restrict navigation +- Set `use_vision=False` to prevent screenshot leaks +- Use `storage_state='./auth.json'` for login cookies instead of passwords when possible diff --git a/docs/customize/supported-models.mdx b/docs/customize/supported-models.mdx index 997494dd3..88a7b0d40 100644 --- a/docs/customize/supported-models.mdx +++ b/docs/customize/supported-models.mdx @@ -1,24 +1,23 @@ --- title: "Supported Models" -description: "Using different chat providers with Browser Use" +description: "Choose your favorite LLM" icon: "robot" + --- -## Model Recommendations +### Recommendations -We recommend using `O3` for the best performance. The best price to performance can be achieved using `gemini-2.0-flash-exp`. +- Best accuracy: `O3` +- Fastest: `llama4` on groq +- Balanced: fast + cheap + clever: `gemini-2.5-flash` or `gpt-4.1-mini` -## Supported Models -In addition to all the models below, we support all other models that can be called via OpenAI compatible API (deepseek, novita, x, qwen). We are open to PRs for more providers. - -### OpenAI +### OpenAI [example](https://github.com/browser-use/browser-use/blob/main/examples/models/gpt-4.1.py) `O3` model is recommended for best performance. ```python -from browser_use.llm import ChatOpenAI -from browser_use import Agent +from browser_use import Agent, ChatOpenAI # Initialize the model llm = ChatOpenAI( @@ -44,11 +43,10 @@ OPENAI_API_KEY= into the normal OpenAI API call). 
-### Anthropic +### Anthropic [example](https://github.com/browser-use/browser-use/blob/main/examples/models/claude-4-sonnet.py) ```python -from browser_use.llm import ChatAnthropic -from browser_use import Agent +from browser_use import Agent, ChatAnthropic # Initialize the model llm = ChatAnthropic( @@ -68,11 +66,10 @@ And add the variable: ANTHROPIC_API_KEY= ``` -### Azure OpenAI +### Azure OpenAI [example](https://github.com/browser-use/browser-use/blob/main/examples/models/azure_openai.py) ```python -from browser_use.llm import ChatAzureOpenAI -from browser_use import Agent +from browser_use import Agent, ChatAzureOpenAI from pydantic import SecretStr import os @@ -95,20 +92,19 @@ AZURE_OPENAI_ENDPOINT=https://your-endpoint.openai.azure.com/ AZURE_OPENAI_API_KEY= ``` -### Gemini +### Gemini [example](https://github.com/browser-use/browser-use/blob/main/examples/models/gemini.py) > [!IMPORTANT] `GEMINI_API_KEY` was the old environment var name, it should be called `GOOGLE_API_KEY` as of 2025-05. ```python -from browser_use.llm import ChatGoogle -from browser_use import Agent +from browser_use import Agent, ChatGoogle from dotenv import load_dotenv # Read GOOGLE_API_KEY into env load_dotenv() # Initialize the model -llm = ChatGoogle(model='gemini-2.0-flash-exp') +llm = ChatGoogle(model='gemini-2.5-flash') # Create agent with the model agent = Agent( @@ -123,15 +119,14 @@ Required environment variables: GOOGLE_API_KEY= ``` -### AWS Bedrock +### AWS Bedrock [example](https://github.com/browser-use/browser-use/blob/main/examples/models/aws.py) AWS Bedrock provides access to multiple model providers through a single API. We support both a general AWS Bedrock client and provider-specific convenience classes. #### General AWS Bedrock (supports all providers) ```python -from browser_use.llm import ChatAWSBedrock -from browser_use import Agent +from browser_use import Agent, ChatAWSBedrock # Works with any Bedrock model (Anthropic, Meta, AI21, etc.) 
llm = ChatAWSBedrock( @@ -149,8 +144,7 @@ agent = Agent( #### Anthropic Claude via AWS Bedrock (convenience class) ```python -from browser_use.llm import ChatAnthropicBedrock -from browser_use import Agent +from browser_use import Agent, ChatAnthropicBedrock # Anthropic-specific class with Claude defaults llm = ChatAnthropicBedrock( @@ -183,11 +177,10 @@ You can also use AWS profiles or IAM roles instead of environment variables. The - Session tokens for temporary credentials - AWS SSO authentication (`aws_sso_auth=True`) -## Groq +## Groq [example](https://github.com/browser-use/browser-use/blob/main/examples/models/llama4-groq.py) ```python -from browser_use.llm import ChatGroq -from browser_use import Agent +from browser_use import Agent, ChatGroq llm = ChatGroq(model="meta-llama/llama-4-maverick-17b-128e-instruct") @@ -206,16 +199,21 @@ GROQ_API_KEY= ## Ollama ```python -from browser_use.llm import ChatOllama -from browser_use import Agent +from browser_use import Agent, ChatOllama llm = ChatOllama(model="llama3.1:8b") ``` -## Migration Guides +## Langchain -### From Langchain +[Example](https://github.com/browser-use/browser-use/blob/main/examples/models/langchain) on how to use Langchain with Browser Use. -To migrate the Langchain based code, just replace `from langchain_openai import ChatOpenAI` with `from browser_use.llm import ChatOpenAI` etc. The methods should be compatible(ish). +## Other models (DeepSeek, Novita, X, Qwen...) + +We support all other models that can be called via OpenAI compatible API. We are open to PRs for more providers. 
+ +**Examples available:** +- [DeepSeek](https://github.com/browser-use/browser-use/blob/main/examples/models/deepseek-chat.py) +- [Novita](https://github.com/browser-use/browser-use/blob/main/examples/models/novita.py) +- [OpenRouter](https://github.com/browser-use/browser-use/blob/main/examples/models/openrouter.py) -We also made and example [here](https://github.com/browser-use/browser-use/blob/main/examples/models/langchain) to help you stay with Langchain in case your workflow requires it. diff --git a/docs/customize/system-prompt.mdx b/docs/customize/system-prompt.mdx index 49dc32985..4d6329db8 100644 --- a/docs/customize/system-prompt.mdx +++ b/docs/customize/system-prompt.mdx @@ -2,6 +2,7 @@ title: "System Prompt" description: "Customize the system prompt to control agent behavior and capabilities" icon: "message" +mode: "wide" --- ## Overview @@ -65,8 +66,8 @@ Always suggest exploring multiple options before making a decision. """ # Create agent with extended planner system prompt -llm = ChatOpenAI(model='gpt-4o') -planner_llm = ChatOpenAI(model='gpt-4o-mini') +llm = ChatOpenAI(model='gpt-4.1-mini') +planner_llm = ChatOpenAI(model='gpt-4.1-mini') agent = Agent( task="Your task here", diff --git a/docs/development.mdx b/docs/development.mdx index 61b85ee42..18b7c432f 100644 --- a/docs/development.mdx +++ b/docs/development.mdx @@ -1,6 +1,7 @@ --- title: 'Development' description: 'Preview changes locally to update your docs' +mode: "wide" --- diff --git a/docs/development/contribution-guide.mdx b/docs/development/contribution-guide.mdx index ccec248eb..4fb182bf9 100644 --- a/docs/development/contribution-guide.mdx +++ b/docs/development/contribution-guide.mdx @@ -2,6 +2,7 @@ title: "Contribution Guide" description: "Learn how to contribute to Browser Use" icon: "github" +mode: "wide" --- # Join the Browser Use Community! 
diff --git a/docs/development/evaluations.mdx b/docs/development/evaluations.mdx deleted file mode 100644 index 051b4303d..000000000 --- a/docs/development/evaluations.mdx +++ /dev/null @@ -1,48 +0,0 @@ ---- -title: "Evaluations" -description: "Test the Browser Use agent on standardized benchmarks" -icon: "chart-bar" ---- - -## Prerequisites - -Browser Use uses proprietary/private test sets that must never be committed to Github and must be fetched through a authorized api request. -Accessing these test sets requires an approved Browser Use account. -There are currently no publicly available test sets, but some may be released in the future. - -## Get an Api Access Key - -First, navigate to https://browser-use.tools and log in with an authorized browser use account. - -Then, click the "Account" button at the top right of the page, and click the "Cycle New Key" button on that page. - -Copy the resulting url and secret key into your `.env` file. It should look like this: - -```bash .env -EVALUATION_TOOL_URL= ... -EVALUATION_TOOL_SECRET_KEY= ... -``` - -## Running Evaluations - -First, ensure your file `eval/service.py` is up to date. - -Then run the file: - -```bash -python eval/service.py -``` - -## Configuring Evaluations - -You can modify the evaluation by providing flags to the evaluation script. For instance: - -```bash -python eval/service.py --parallel_runs 5 --parallel_evaluations 5 --max-steps 25 --start 0 --end 100 --model gpt-4o -``` - -The evaluations webpage has a convenient GUI for generating these commands. To use it, navigate to https://browser-use.tools/dashboard. - -Then click the button "New Eval Run" on the left panel. This will open a interface with selectors, inputs, sliders, and switches. - -Input your desired configuration into the interface and copy the resulting python command at the bottom. Then run this command as before. 
diff --git a/docs/development/local-setup.mdx b/docs/development/local-setup.mdx index d4442782a..44827401e 100644 --- a/docs/development/local-setup.mdx +++ b/docs/development/local-setup.mdx @@ -2,6 +2,7 @@ title: "Local Setup" description: "Set up Browser Use development environment locally" icon: "laptop-code" +mode: "wide" --- # Welcome to Browser Use Development! diff --git a/docs/development/n8n-integration.mdx b/docs/development/n8n-integration.mdx index 2a6fd29b5..70e165792 100644 --- a/docs/development/n8n-integration.mdx +++ b/docs/development/n8n-integration.mdx @@ -1,6 +1,7 @@ --- title: 'n8n Integration' description: 'Learn how to integrate Browser Use with n8n workflows' +mode: "wide" --- # Browser Use n8n Integration diff --git a/docs/development/observability.mdx b/docs/development/observability.mdx index 2064533ee..edffde2c3 100644 --- a/docs/development/observability.mdx +++ b/docs/development/observability.mdx @@ -2,6 +2,7 @@ title: "Observability" description: "Trace Browser Use's agent execution steps and browser sessions" icon: "eye" +mode: "wide" --- ## Overview @@ -9,20 +10,12 @@ icon: "eye" Browser Use has a native integration with [Laminar](https://lmnr.ai) - open-source platform for tracing, evals and labeling of AI agents. Read more about Laminar in the [Laminar docs](https://docs.lmnr.ai). - - Laminar excels at tracing browser agents by providing unified visibility into - both browser session recordings and agent execution steps. - - ## Setup -To setup Laminar, you need to install the `lmnr` package and set the `LMNR_PROJECT_API_KEY` environment variable. -To get your project API key, you can either: - -- Register on [Laminar Cloud](https://lmnr.ai) and get the key from your project settings -- Or spin up a local Laminar instance and get the key from the settings page +Register on [Laminar Cloud](https://lmnr.ai) and get the key from your project settings. +Set the `LMNR_PROJECT_API_KEY` environment variable. 
```bash pip install 'lmnr[all]' export LMNR_PROJECT_API_KEY= @@ -33,21 +26,19 @@ export LMNR_PROJECT_API_KEY= Then, you simply initialize the Laminar at the top of your project and both Browser Use and session recordings will be automatically traced. ```python {5-8} -from browser_use.llm import ChatOpenAI -from browser_use import Agent +from browser_use import Agent, ChatOpenAI import asyncio from lmnr import Laminar, Instruments # this line auto-instruments Browser Use and any browser you use (local or remote) -Laminar.initialize(project_api_key="...", disable_batch=True, disabled_instruments={Instruments.BROWSER_USE}) # you can also pass project api key here +Laminar.initialize(project_api_key="...") async def main(): agent = Agent( task="open google, search Laminar AI", llm=ChatOpenAI(model="gpt-4.1-mini"), ) - result = await agent.run() - print(result) + await agent.run() asyncio.run(main()) ``` diff --git a/docs/development/roadmap.mdx b/docs/development/roadmap.mdx index 34f05f5a4..4ff49e8bc 100644 --- a/docs/development/roadmap.mdx +++ b/docs/development/roadmap.mdx @@ -2,6 +2,7 @@ title: "Roadmap" description: "Future plans and upcoming features for Browser Use" icon: "road" +mode: "wide" --- Big things coming soon! 
diff --git a/docs/development/telemetry.mdx b/docs/development/telemetry.mdx index fe4f7cb54..c2ef35758 100644 --- a/docs/development/telemetry.mdx +++ b/docs/development/telemetry.mdx @@ -2,6 +2,7 @@ title: "Telemetry" description: "Understanding Browser Use's telemetry and privacy settings" icon: "chart-mixed" +mode: "wide" --- ## Overview diff --git a/docs/docs.json b/docs/docs.json index 606d27560..45ba63082 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -9,7 +9,10 @@ }, "favicon": "/favicon.ico", "contextual": { - "options": ["copy", "view"] + "options": [ + "copy", + "view" + ] }, "fonts": { "family": "Geist" @@ -26,22 +29,50 @@ "groups": [ { "group": "Get Started", - "pages": ["introduction", "quickstart", "cli"] + "pages": [ + "introduction", + "quickstart", + "quickstart_llm" + ] }, { "group": "Customize", "pages": [ - "customize/supported-models", - "customize/agent-settings", - "customize/browser-settings", - "customize/real-browser", - "customize/output-format", - "customize/system-prompt", - "customize/sensitive-data", - "customize/custom-functions", - "customize/mcp-client", - "customize/mcp-server", - "customize/hooks" + { + "group": "Agent", + "icon": "robot", + "isDefaultOpen": true, + "pages": [ + "customize/agent-basic", + "customize/supported-models", + "customize/agent-output-format", + "customize/agent-parameters" + ] + }, + { + "group": "Browser", + "icon": "window", + "isDefaultOpen": false, + "pages": [ + "customize/browser-basic", + "customize/browser-real-browser", + "customize/browser-remote", + "customize/browser-parameters" + ] + }, + { + "group": "Examples", + "icon": "folder-open", + "pages": [ + "customize/fast-agent", + "customize/chain-agents", + "customize/parallel-browser", + "customize/sensitive-data", + "customize/secure", + "customize/more-examples" + ] + }, + "customize/custom-functions" ] }, { @@ -49,10 +80,17 @@ "pages": [ "development/contribution-guide", "development/local-setup", + { + "group": "MCP", + "icon": 
"link", + "pages": [ + "customize/mcp-client", + "customize/mcp-server" + ] + }, + "customize/hooks", "development/telemetry", - "development/observability", - "development/evaluations", - "development/roadmap" + "development/observability" ] } ] @@ -126,7 +164,11 @@ "display": "interactive" }, "examples": { - "languages": ["javascript", "curl", "python"], + "languages": [ + "javascript", + "curl", + "python" + ], "required": true } }, @@ -154,4 +196,4 @@ "linkedin": "https://linkedin.com/company/browser-use" } } -} +} \ No newline at end of file diff --git a/docs/introduction.mdx b/docs/introduction.mdx index 806068cc3..b552a98b6 100644 --- a/docs/introduction.mdx +++ b/docs/introduction.mdx @@ -1,6 +1,6 @@ --- title: "Introduction" -description: "Repetitive work is dead. Browser Use empowers anyone to automate repetitive online tasks. Simply tell it what do you want done." +description: "Automate browser tasks in plain text. " icon: "book-open" --- @@ -17,7 +17,7 @@ icon: "book-open" - Get up and running with Browser Use locally + Open-source Python library. - Skip the setup and start automating with Browser Use Cloud + Scale up with our cloud. + diff --git a/docs/quickstart.mdx b/docs/quickstart.mdx index a9a692f62..cb8725833 100644 --- a/docs/quickstart.mdx +++ b/docs/quickstart.mdx @@ -1,23 +1,18 @@ --- -title: "Quickstart" -description: "Start using Browser Use with this quickstart guide" +title: "Human Quickstart" +description: "" icon: "rocket" --- - - You can skip this steps by using [Browser Use Cloud](/cloud/v2/quickstart) - -## Prepare the environment +## 1. Easy setup -Use [uv](https://docs.astral.sh/uv/) to setup the Python environment. 
+Use [uv](https://docs.astral.sh/uv/) to create and activate the environment: ```bash uv venv --python 3.12 ``` -and activate it with: - ```bash # For Mac/Linux: source .venv/bin/activate @@ -26,50 +21,42 @@ source .venv/bin/activate .venv\Scripts\activate ``` -Install the dependencies: +Install browser-use: ```bash uv pip install browser-use ``` -Then install Chromium from [source](https://www.chromium.org/getting-involved/download-chromium/) or run the command below (this does not install Playwright only Chromium and dependencies). +Install Chromium: ```bash uvx playwright install chromium --with-deps ``` -## Create an agent - -Then you can use the agent as follows: - -```python agent.py -from browser_use.llm import ChatOpenAI -from browser_use import Agent -from dotenv import load_dotenv -load_dotenv() - -import asyncio - -llm = ChatOpenAI(model="gpt-5") - -async def main(): - agent = Agent( - task="Go to Hacker News and find the number 1 trending on Show HN", - llm=llm, - ) - result = await agent.run() - print(result) - -asyncio.run(main()) -``` - -## Set up your LLM API keys - -You need to set up API keys for the LLM you want to use and store them in `.env` file. For example, for OpenAI and Anthropic: +## 2. Choose your favorite LLM +Create a `.env` file and add your API key: ```bash .env OPENAI_API_KEY= -ANTHROPIC_API_KEY= ``` -For other LLM models you can refer to the [Supported Models](/customize/supported-models) page to find how to set them up with their specific API keys. +See [Supported Models](/customize/supported-models) for other models. + +## 3. 
Run your first agent + +```python agent.py +from browser_use import Agent, ChatOpenAI +from dotenv import load_dotenv +import asyncio + +load_dotenv() + +async def main(): + llm = ChatOpenAI(model="gpt-4.1-mini") + task = "Find the number 1 post on Show HN" + agent = Agent(task=task, llm=llm) + await agent.run() + +if __name__ == "__main__": + asyncio.run(main()) +``` diff --git a/docs/quickstart_llm.mdx b/docs/quickstart_llm.mdx new file mode 100644 index 000000000..b63d60b4a --- /dev/null +++ b/docs/quickstart_llm.mdx @@ -0,0 +1,11 @@ +--- +title: "LLM Quickstart" +description: "" +icon: "brain" +--- + + + +1. Copy all content [🔗 from here](https://docs.browser-use.com/llms-full.txt) (~40k tokens) +2. Paste it into your favorite coding agent (Cursor, Claude, ChatGPT ...). + diff --git a/examples/browser/parallel_browser.py b/examples/browser/parallel_browser.py new file mode 100644 index 000000000..9776d9c6a --- /dev/null +++ b/examples/browser/parallel_browser.py @@ -0,0 +1,45 @@ +import asyncio + +from browser_use import Agent, Browser, ChatOpenAI + +# NOTE: This is still experimental, and agents might conflict with each other. 
+ + +async def main(): + # Create 3 separate browser instances + browsers = [ + Browser( + user_data_dir=f'./temp-profile-{i}', + headless=False, + ) + for i in range(3) + ] + + # Create 3 agents with different tasks + agents = [ + Agent( + task='Search for "browser automation" on Google', + browser=browsers[0], + llm=ChatOpenAI(model='gpt-4.1-mini'), + ), + Agent( + task='Search for "AI agents" on DuckDuckGo', + browser=browsers[1], + llm=ChatOpenAI(model='gpt-4.1-mini'), + ), + Agent( + task='Visit Wikipedia and search for "web scraping"', + browser=browsers[2], + llm=ChatOpenAI(model='gpt-4.1-mini'), + ), + ] + + # Run all agents in parallel + tasks = [agent.run() for agent in agents] + results = await asyncio.gather(*tasks, return_exceptions=True) + + print('🎉 All agents completed!') + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/examples/browser/real_browser.py b/examples/browser/real_browser.py index d416a9ad3..09fef6670 100644 --- a/examples/browser/real_browser.py +++ b/examples/browser/real_browser.py @@ -8,25 +8,22 @@ from dotenv import load_dotenv load_dotenv() -from browser_use import Agent, BrowserProfile, BrowserSession, ChatOpenAI +from browser_use import Agent, Browser, ChatOpenAI -# SETUP: First copy your real Chrome profile (close Chrome first, then run): -# Mac: -# mkdir -p ~/.config/browseruse/profiles && cp -r ~/Library/Application\ Support/Google/Chrome ~/.config/browseruse/profiles/real-chrome - - -browser_profile = BrowserProfile( +# Connect to your existing Chrome browser +browser = Browser( executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', - user_data_dir='~/.config/browseruse/profiles/real-chrome', + user_data_dir='~/Library/Application Support/Google/Chrome', + profile_directory='Default', ) -browser_session = BrowserSession(browser_profile=browser_profile) async def main(): agent = Agent( llm=ChatOpenAI(model='gpt-4.1-mini'), + # Google blocks this approach, so we use a different search 
engine task='Visit https://duckduckgo.com and search for "browser-use founders"', - browser_session=browser_session, + browser=browser, ) await agent.run() diff --git a/examples/browser/using_cdp.py b/examples/browser/using_cdp.py new file mode 100644 index 000000000..1b671d835 --- /dev/null +++ b/examples/browser/using_cdp.py @@ -0,0 +1,53 @@ +""" +Simple demonstration of the CDP feature. + +To test this locally, follow these steps: +1. Create a shortcut for the executable Chrome file. +2. Add the following argument to the shortcut: + - On Windows: `--remote-debugging-port=9222` +3. Open a web browser and navigate to `http://localhost:9222/json/version` to verify that the Chrome DevTools Protocol (CDP) endpoint is running. +4. Launch this example. + +@dev You need to set the `OPENAI_API_KEY` environment variable before proceeding. +""" + +import asyncio +import os +import sys + +sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +from dotenv import load_dotenv + +load_dotenv() + +from browser_use import Agent, Controller +from browser_use.browser import BrowserProfile, BrowserSession +from browser_use.llm import ChatOpenAI + +browser_session = BrowserSession( + browser_profile=BrowserProfile( + headless=False, + ), + cdp_url='http://localhost:9222', + is_local=True, # set to False if you want to use a remote browser +) +controller = Controller() + + +async def main(): + agent = Agent( + task='Visit https://duckduckgo.com and search for "browser-use founders"', + llm=ChatOpenAI(model='gpt-4.1-mini'), + controller=controller, + browser_session=browser_session, + ) + + await agent.run() + await browser_session.kill() + + input('Press Enter to close...') + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/examples/features/custom_output.py b/examples/features/custom_output.py index aa1a500d4..69a7a1f81 100644 --- a/examples/features/custom_output.py +++ b/examples/features/custom_output.py @@ -16,7 +16,7 @@ 
load_dotenv() from pydantic import BaseModel -from browser_use import Agent, ChatOpenAI, Controller +from browser_use import Agent, ChatOpenAI class Post(BaseModel): @@ -30,13 +30,10 @@ class Posts(BaseModel): posts: list[Post] -controller = Controller(output_model=Posts) - - async def main(): task = 'Go to hackernews show hn and give me the first 5 posts' model = ChatOpenAI(model='gpt-4.1-mini') - agent = Agent(task=task, llm=model, controller=controller) + agent = Agent(task=task, llm=model, output_model_schema=Posts) history = await agent.run() diff --git a/examples/features/follow_up_tasks.py b/examples/features/follow_up_tasks.py index 229d2eb29..e8efd70c1 100644 --- a/examples/features/follow_up_tasks.py +++ b/examples/features/follow_up_tasks.py @@ -2,46 +2,30 @@ import asyncio import os import sys +from browser_use.browser.profile import BrowserProfile + sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) from dotenv import load_dotenv load_dotenv() -from browser_use import Agent, ChatOpenAI, Controller -from browser_use.browser import BrowserProfile, BrowserSession +from browser_use import Agent -# Initialize the model -llm = ChatOpenAI( - model='gpt-4.1', - temperature=0.0, -) -# Get your chrome path -browser_session = BrowserSession( - browser_profile=BrowserProfile( - executable_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', - keep_alive=True, - user_data_dir='~/.config/browseruse/profiles/default', - ), -) - -controller = Controller() +profile = BrowserProfile(keep_alive=True) -task = 'Find the founders of browser-use and draft them a short personalized message' - -agent = Agent(task=task, llm=llm, controller=controller, browser_session=browser_session) +task = """Go to reddit.com""" async def main(): - await agent.run() + agent = Agent(task=task, browser_profile=profile) + await agent.run(max_steps=1) - # new_task = input('Type in a new task: ') - new_task = 'Find an image of the 
founders' - - agent.add_new_task(new_task) - - await agent.run() + while True: + user_response = input('\n👤 New task or "q" to quit: ') + agent.add_new_task(f'New task: {user_response}') + await agent.run() if __name__ == '__main__': diff --git a/examples/features/secure.py b/examples/features/secure.py index 53aa7d12e..045f4e2bc 100644 --- a/examples/features/secure.py +++ b/examples/features/secure.py @@ -49,8 +49,7 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath( load_dotenv() -# Disable all telemetry -os.environ['BROWSER_USE_CLOUD_SYNC'] = 'false' + os.environ['ANONYMIZED_TELEMETRY'] = 'false' @@ -67,7 +66,7 @@ task = 'Find the founders of the sensitive company_name' # Configuration Browser (optional) browser_profile = BrowserProfile(allowed_domains=['*google.com', 'browser-use.com'], enable_default_extensions=False) -# Sensitive data (optional) - {key: sensitive_information} - we filter out the sensitive_information from any input to the LLM, it will only wokr with placeholder. +# Sensitive data (optional) - {key: sensitive_information} - we filter out the sensitive_information from any input to the LLM, it will only work with placeholder. # By default we pass screenshots to the LLM which can contain your information. Set use_vision=False to disable this. # If you trust your LLM endpoint, you don't need to worry about this. 
sensitive_data: dict[str, str | dict[str, str]] = {'company_name': 'browser-use'} diff --git a/examples/getting_started/05_fast_agent.py b/examples/getting_started/05_fast_agent.py index 759031336..a6aa2f1e9 100644 --- a/examples/getting_started/05_fast_agent.py +++ b/examples/getting_started/05_fast_agent.py @@ -14,7 +14,7 @@ from browser_use import Agent, BrowserProfile # Speed optimization instructions for the model SPEED_OPTIMIZATION_PROMPT = """ -SPEED OPTIMIZATION INSTRUCTIONS: +Speed optimization instructions: - Be extremely concise and direct in your responses - Get to the goal as quickly as possible - Use multi-action sequences whenever possible to reduce steps diff --git a/examples/simple.py b/examples/simple.py index 830f7e8e3..ac9c3b1f4 100644 --- a/examples/simple.py +++ b/examples/simple.py @@ -1,3 +1,13 @@ -from browser_use import Agent +import asyncio -Agent('Find the founders of browser-use').run_sync() +from browser_use import Agent, ChatOpenAI + + +async def main(): + task = 'Find the founders of browser-use' + agent = Agent(task=task, llm=ChatOpenAI(model='gpt-4.1-mini')) + await agent.run() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/examples/use-cases/extract_pdf_content.py b/examples/use-cases/extract_pdf_content.py index e1cd32ab7..9be5633f5 100755 --- a/examples/use-cases/extract_pdf_content.py +++ b/examples/use-cases/extract_pdf_content.py @@ -24,7 +24,7 @@ logger = logging.getLogger(__name__) async def main(): agent = Agent( task=""" - Objective: Navigate to the following URL whats on page 3? + Objective: Navigate to the following URL, what is on page 3? 
URL: https://docs.house.gov/meetings/GO/GO00/20220929/115171/HHRG-117-GO00-20220929-SD010.pdf """, diff --git a/tests/ci/test_agent_multiprocessing.py b/tests/ci/test_agent_concurrency_multiprocessing.py similarity index 100% rename from tests/ci/test_agent_multiprocessing.py rename to tests/ci/test_agent_concurrency_multiprocessing.py diff --git a/tests/ci/test_sequential_agents_simple.py b/tests/ci/test_agent_concurrency_sequential.py similarity index 100% rename from tests/ci/test_sequential_agents_simple.py rename to tests/ci/test_agent_concurrency_sequential.py diff --git a/tests/ci/test_agent_shutdown.py b/tests/ci/test_agent_concurrency_shutdown.py similarity index 100% rename from tests/ci/test_agent_shutdown.py rename to tests/ci/test_agent_concurrency_shutdown.py diff --git a/tests/ci/test_gif_filtering.py b/tests/ci/test_agent_gif_filtering.py similarity index 100% rename from tests/ci/test_gif_filtering.py rename to tests/ci/test_agent_gif_filtering.py diff --git a/tests/ci/test_gif_generation_with_navigation.py b/tests/ci/test_agent_gif_generation_with_navigation.py similarity index 100% rename from tests/ci/test_gif_generation_with_navigation.py rename to tests/ci/test_agent_gif_generation_with_navigation.py diff --git a/tests/ci/test_browser_event_GetDropdownOptionsEvent.py b/tests/ci/test_browser_event_GetDropdownOptionsEvent.py new file mode 100644 index 000000000..544646b2d --- /dev/null +++ b/tests/ci/test_browser_event_GetDropdownOptionsEvent.py @@ -0,0 +1,656 @@ +"""Test GetDropdownOptionsEvent and SelectDropdownOptionEvent functionality. + +This file consolidates all tests related to dropdown functionality including: +- Native + + + + + +
No selection made
+ + + + """, + content_type='text/html', + ) + + # Add route for ARIA menu test page + server.expect_request('/aria-menu').respond_with_data( + """ + + + + ARIA Menu Test + + + +

ARIA Menu Test

+

This menu uses ARIA roles instead of native select elements

+ + + +
Click an option to see the result
+ + + + + """, + content_type='text/html', + ) + + # Add route for custom dropdown test page + server.expect_request('/custom-dropdown').respond_with_data( + """ + + + + Custom Dropdown Test + + + +

Custom Dropdown Test

+

This is a custom dropdown implementation (like Semantic UI)

+ + + +
No selection made
+ + + + + """, + content_type='text/html', + ) + + yield server + server.stop() + + +@pytest.fixture(scope='session') +def base_url(http_server): + """Return the base URL for the test HTTP server.""" + return f'http://{http_server.host}:{http_server.port}' + + +@pytest.fixture(scope='module') +async def browser_session(): + """Create and provide a Browser instance with security disabled.""" + browser_session = BrowserSession( + browser_profile=BrowserProfile( + headless=True, + user_data_dir=None, + keep_alive=True, + chromium_sandbox=False, # Disable sandbox for CI environment + ) + ) + await browser_session.start() + yield browser_session + await browser_session.kill() + + +@pytest.fixture(scope='function') +def controller(): + """Create and provide a Controller instance.""" + return Controller() + + +class TestGetDropdownOptionsEvent: + """Test GetDropdownOptionsEvent functionality for various dropdown types.""" + + async def test_native_select_dropdown(self, controller, browser_session: BrowserSession, base_url): + """Test get_dropdown_options with native HTML select element.""" + # Navigate to the native dropdown test page + goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/native-dropdown', new_tab=False)} + + class GoToUrlActionModel(ActionModel): + go_to_url: GoToUrlAction | None = None + + await controller.act(GoToUrlActionModel(**goto_action), browser_session) + + # Initialize the DOM state to populate the selector map + await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + + # Get the selector map and find the select element + selector_map = await browser_session.get_selector_map() + dropdown_index = None + for idx, element in selector_map.items(): + if element.tag_name.lower() == 'select' and element.attributes.get('id') == 'test-dropdown': + dropdown_index = idx + break + + assert dropdown_index is not None, ( + f'Could not find select element in selector map. 
Available elements: {[f"{idx}: {element.tag_name}" for idx, element in selector_map.items()]}' + ) + + # Test via controller action + class GetDropdownOptionsModel(ActionModel): + get_dropdown_options: dict[str, int] + + result = await controller.act( + action=GetDropdownOptionsModel(get_dropdown_options={'index': dropdown_index}), + browser_session=browser_session, + ) + + # Verify the result + assert isinstance(result, ActionResult) + assert result.extracted_content is not None + + # Verify all expected options are present + expected_options = ['Please select', 'First Option', 'Second Option', 'Third Option'] + for option in expected_options: + assert option in result.extracted_content, f"Option '{option}' not found in result content" + + # Verify instruction is included + assert 'Use the exact text string' in result.extracted_content and 'select_dropdown_option' in result.extracted_content + + # Also test direct event dispatch + node = await browser_session.get_element_by_index(dropdown_index) + assert node is not None + event = browser_session.event_bus.dispatch(GetDropdownOptionsEvent(node=node)) + dropdown_data = await event.event_result(timeout=3.0) + + assert dropdown_data is not None + assert 'options' in dropdown_data + assert 'type' in dropdown_data + assert dropdown_data['type'] == 'select' + + async def test_aria_menu_dropdown(self, controller, browser_session: BrowserSession, base_url): + """Test get_dropdown_options with ARIA role='menu' element.""" + # Navigate to the ARIA menu test page + goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/aria-menu', new_tab=False)} + + class GoToUrlActionModel(ActionModel): + go_to_url: GoToUrlAction | None = None + + await controller.act(GoToUrlActionModel(**goto_action), browser_session) + + # Initialize the DOM state + await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + + # Get the selector map and find the ARIA menu + selector_map = await 
browser_session.get_selector_map() + menu_index = None + for idx, element in selector_map.items(): + if ( + element.tag_name.lower() == 'ul' + and element.attributes.get('role') == 'menu' + and element.attributes.get('id') == 'pyNavigation1752753375773' + ): + menu_index = idx + break + + assert menu_index is not None, ( + f'Could not find ARIA menu element in selector map. Available elements: {[f"{idx}: {element.tag_name} role={element.attributes.get('role', 'None')}" for idx, element in selector_map.items()]}' + ) + + # Test via controller action + class GetDropdownOptionsModel(ActionModel): + get_dropdown_options: dict[str, int] + + result = await controller.act( + action=GetDropdownOptionsModel(get_dropdown_options={'index': menu_index}), + browser_session=browser_session, + ) + + # Verify the result + assert isinstance(result, ActionResult) + assert result.extracted_content is not None + + # Verify expected ARIA menu options are present + expected_options = ['Filter', 'Sort', 'Appearance', 'Summarize', 'Delete'] + for option in expected_options: + assert option in result.extracted_content, f"Option '{option}' not found in result content" + + # Also test direct event dispatch + node = await browser_session.get_element_by_index(menu_index) + assert node is not None + event = browser_session.event_bus.dispatch(GetDropdownOptionsEvent(node=node)) + dropdown_data = await event.event_result(timeout=3.0) + + assert dropdown_data is not None + assert 'options' in dropdown_data + assert 'type' in dropdown_data + assert dropdown_data['type'] == 'aria' + + async def test_custom_dropdown(self, controller, browser_session: BrowserSession, base_url): + """Test get_dropdown_options with custom dropdown implementation.""" + # Navigate to the custom dropdown test page + goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/custom-dropdown', new_tab=False)} + + class GoToUrlActionModel(ActionModel): + go_to_url: GoToUrlAction | None = None + + await 
controller.act(GoToUrlActionModel(**goto_action), browser_session) + + # Initialize the DOM state + await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + + # Get the selector map and find the custom dropdown + selector_map = await browser_session.get_selector_map() + dropdown_index = None + for idx, element in selector_map.items(): + if element.attributes.get('id') == 'custom-dropdown' and 'dropdown' in element.attributes.get('class', ''): + dropdown_index = idx + break + + assert dropdown_index is not None, ( + f'Could not find custom dropdown element in selector map. Available elements: {[f"{idx}: {element.tag_name} id={element.attributes.get('id', 'None')}" for idx, element in selector_map.items()]}' + ) + + # Test via controller action + class GetDropdownOptionsModel(ActionModel): + get_dropdown_options: dict[str, int] + + result = await controller.act( + action=GetDropdownOptionsModel(get_dropdown_options={'index': dropdown_index}), + browser_session=browser_session, + ) + + # Verify the result + assert isinstance(result, ActionResult) + assert result.extracted_content is not None + + # Verify expected custom dropdown options are present + expected_options = ['Red', 'Green', 'Blue', 'Yellow'] + for option in expected_options: + assert option in result.extracted_content, f"Option '{option}' not found in result content" + + # Also test direct event dispatch + node = await browser_session.get_element_by_index(dropdown_index) + assert node is not None + event = browser_session.event_bus.dispatch(GetDropdownOptionsEvent(node=node)) + dropdown_data = await event.event_result(timeout=3.0) + + assert dropdown_data is not None + assert 'options' in dropdown_data + assert 'type' in dropdown_data + assert dropdown_data['type'] == 'custom' + + async def test_element_not_found_error(self, controller, browser_session: BrowserSession, base_url): + """Test get_dropdown_options with invalid element index.""" + # Navigate to any test page + 
goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/native-dropdown', new_tab=False)} + + class GoToUrlActionModel(ActionModel): + go_to_url: GoToUrlAction | None = None + + await controller.act(GoToUrlActionModel(**goto_action), browser_session) + await browser_session.event_bus.expect(NavigationCompleteEvent, timeout=10.0) + + # Try to get dropdown options with invalid index + class GetDropdownOptionsModel(ActionModel): + get_dropdown_options: dict[str, int] + + result = await controller.act( + action=GetDropdownOptionsModel(get_dropdown_options={'index': 99999}), + browser_session=browser_session, + ) + + # Should return an error + assert isinstance(result, ActionResult) + assert result.error is not None + assert 'not found' in result.error.lower() + + +class TestSelectDropdownOptionEvent: + """Test SelectDropdownOptionEvent functionality for various dropdown types.""" + + async def test_select_native_dropdown_option(self, controller, browser_session: BrowserSession, base_url): + """Test select_dropdown_option with native HTML select element.""" + # Navigate to the native dropdown test page + goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/native-dropdown', new_tab=False)} + + class GoToUrlActionModel(ActionModel): + go_to_url: GoToUrlAction | None = None + + await controller.act(GoToUrlActionModel(**goto_action), browser_session) + await browser_session.event_bus.expect(NavigationCompleteEvent, timeout=10.0) + + # Initialize the DOM state + await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + + # Get the selector map and find the select element + selector_map = await browser_session.get_selector_map() + dropdown_index = None + for idx, element in selector_map.items(): + if element.tag_name.lower() == 'select' and element.attributes.get('id') == 'test-dropdown': + dropdown_index = idx + break + + assert dropdown_index is not None + + # Test via controller action + class SelectDropdownOptionModel(ActionModel): 
+ select_dropdown_option: dict + + result = await controller.act( + SelectDropdownOptionModel(select_dropdown_option={'index': dropdown_index, 'text': 'Second Option'}), + browser_session, + ) + + # Verify the result + assert isinstance(result, ActionResult) + assert result.extracted_content is not None + assert 'Second Option' in result.extracted_content + + # Verify the selection actually worked using CDP + cdp_session = await browser_session.get_or_create_cdp_session() + result = await cdp_session.cdp_client.send.Runtime.evaluate( + params={'expression': "document.getElementById('test-dropdown').selectedIndex", 'returnByValue': True}, + session_id=cdp_session.session_id, + ) + selected_index = result.get('result', {}).get('value', -1) + assert selected_index == 2, f'Expected selected index 2, got {selected_index}' + + async def test_select_aria_menu_option(self, controller, browser_session: BrowserSession, base_url): + """Test select_dropdown_option with ARIA menu.""" + # Navigate to the ARIA menu test page + goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/aria-menu', new_tab=False)} + + class GoToUrlActionModel(ActionModel): + go_to_url: GoToUrlAction | None = None + + await controller.act(GoToUrlActionModel(**goto_action), browser_session) + await browser_session.event_bus.expect(NavigationCompleteEvent, timeout=10.0) + + # Initialize the DOM state + await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + + # Get the selector map and find the ARIA menu + selector_map = await browser_session.get_selector_map() + menu_index = None + for idx, element in selector_map.items(): + if ( + element.tag_name.lower() == 'ul' + and element.attributes.get('role') == 'menu' + and element.attributes.get('id') == 'pyNavigation1752753375773' + ): + menu_index = idx + break + + assert menu_index is not None + + # Test via controller action + class SelectDropdownOptionModel(ActionModel): + select_dropdown_option: dict + + result = await 
controller.act( + SelectDropdownOptionModel(select_dropdown_option={'index': menu_index, 'text': 'Filter'}), + browser_session, + ) + + # Verify the result + assert isinstance(result, ActionResult) + assert result.extracted_content is not None + assert 'Filter' in result.extracted_content + + # Verify the click had an effect using CDP + cdp_session = await browser_session.get_or_create_cdp_session() + result = await cdp_session.cdp_client.send.Runtime.evaluate( + params={'expression': "document.getElementById('result').textContent", 'returnByValue': True}, + session_id=cdp_session.session_id, + ) + result_text = result.get('result', {}).get('value', '') + assert 'Filter' in result_text, f"Expected 'Filter' in result text, got '{result_text}'" + + async def test_select_custom_dropdown_option(self, controller, browser_session: BrowserSession, base_url): + """Test select_dropdown_option with custom dropdown.""" + # Navigate to the custom dropdown test page + goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/custom-dropdown', new_tab=False)} + + class GoToUrlActionModel(ActionModel): + go_to_url: GoToUrlAction | None = None + + await controller.act(GoToUrlActionModel(**goto_action), browser_session) + await browser_session.event_bus.expect(NavigationCompleteEvent, timeout=10.0) + + # Initialize the DOM state + await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + + # Get the selector map and find the custom dropdown + selector_map = await browser_session.get_selector_map() + dropdown_index = None + for idx, element in selector_map.items(): + if element.attributes.get('id') == 'custom-dropdown' and 'dropdown' in element.attributes.get('class', ''): + dropdown_index = idx + break + + assert dropdown_index is not None + + # Test via controller action + class SelectDropdownOptionModel(ActionModel): + select_dropdown_option: dict + + result = await controller.act( + SelectDropdownOptionModel(select_dropdown_option={'index': 
dropdown_index, 'text': 'Blue'}), + browser_session, + ) + + # Verify the result + assert isinstance(result, ActionResult) + assert result.extracted_content is not None + assert 'Blue' in result.extracted_content + + # Verify the selection worked using CDP + cdp_session = await browser_session.get_or_create_cdp_session() + result = await cdp_session.cdp_client.send.Runtime.evaluate( + params={'expression': "document.getElementById('result').textContent", 'returnByValue': True}, + session_id=cdp_session.session_id, + ) + result_text = result.get('result', {}).get('value', '') + assert 'Blue' in result_text, f"Expected 'Blue' in result text, got '{result_text}'" + + async def test_select_invalid_option_error(self, controller, browser_session: BrowserSession, base_url): + """Test select_dropdown_option with non-existent option text.""" + # Navigate to the native dropdown test page + goto_action = {'go_to_url': GoToUrlAction(url=f'{base_url}/native-dropdown', new_tab=False)} + + class GoToUrlActionModel(ActionModel): + go_to_url: GoToUrlAction | None = None + + await controller.act(GoToUrlActionModel(**goto_action), browser_session) + await browser_session.event_bus.expect(NavigationCompleteEvent, timeout=10.0) + + # Initialize the DOM state + await browser_session.get_browser_state_summary(cache_clickable_elements_hashes=True) + + # Get the selector map and find the select element + selector_map = await browser_session.get_selector_map() + dropdown_index = None + for idx, element in selector_map.items(): + if element.tag_name.lower() == 'select' and element.attributes.get('id') == 'test-dropdown': + dropdown_index = idx + break + + assert dropdown_index is not None + + # Try to select non-existent option via direct event + node = await browser_session.get_element_by_index(dropdown_index) + assert node is not None + event = browser_session.event_bus.dispatch(SelectDropdownOptionEvent(node=node, text='Non-existent Option')) + + try: + selection_data = await 
event.event_result(timeout=3.0) + except Exception as e: + # Raising is acceptable, but only with a "not found"-style message + assert 'not found' in str(e).lower() or 'no option' in str(e).lower() + else: + # Checked outside the try so a failing assert is not swallowed by the except above + assert selection_data is not None and ('error' in selection_data or 'not found' in str(selection_data).lower()) diff --git a/tests/ci/test_aria_menu_dropdown.py b/tests/ci/test_browser_event_GetDropdownOptionsEvent_aria_menus.py similarity index 100% rename from tests/ci/test_aria_menu_dropdown.py rename to tests/ci/test_browser_event_GetDropdownOptionsEvent_aria_menus.py diff --git a/tests/ci/test_navigation_events.py b/tests/ci/test_browser_event_NavigateToUrlEvent2.py similarity index 100% rename from tests/ci/test_navigation_events.py rename to tests/ci/test_browser_event_NavigateToUrlEvent2.py diff --git a/tests/ci/test_fill_fallback.py b/tests/ci/test_browser_event_TypeTextEvent3.py similarity index 100% rename from tests/ci/test_fill_fallback.py rename to tests/ci/test_browser_event_TypeTextEvent3.py diff --git a/tests/ci/test_browser_session_proxy.py b/tests/ci/test_browser_session_proxy.py new file mode 100644 index 000000000..cf4396d45 --- /dev/null +++ b/tests/ci/test_browser_session_proxy.py @@ -0,0 +1,112 @@ +import asyncio +from typing import Any + +import pytest + +from browser_use.browser import BrowserProfile, BrowserSession +from browser_use.config import CONFIG + + +def test_chromium_args_include_proxy_flags(): + profile = BrowserProfile( + headless=True, + user_data_dir=str(CONFIG.BROWSER_USE_PROFILES_DIR / 'proxy-smoke'), + proxy={ + 'server': 'http://proxy.local:8080', + 'bypass': 'localhost,127.0.0.1', + }, + ) + args = profile.get_args() + assert any(a == '--proxy-server=http://proxy.local:8080' for a in args), args + assert any(a == '--proxy-bypass-list=localhost,127.0.0.1' for a in args), args + + +@pytest.mark.asyncio +async def test_cdp_proxy_auth_handler_registers_and_responds(): + # Create profile with proxy auth credentials + profile =
BrowserProfile( + headless=True, + user_data_dir=str(CONFIG.BROWSER_USE_PROFILES_DIR / 'proxy-smoke'), + proxy={'username': 'user', 'password': 'pass'}, + ) + session = BrowserSession(browser_profile=profile) + + # Stub CDP client with minimal Fetch support + class StubCDP: + def __init__(self) -> None: + self.enabled = False + self.last_auth: dict[str, Any] | None = None + self.last_default: dict[str, Any] | None = None + self.auth_callback = None + self.request_paused_callback = None + + class _FetchSend: + def __init__(self, outer: 'StubCDP') -> None: + self._outer = outer + + async def enable(self, params: dict, session_id: str | None = None) -> None: + self._outer.enabled = True + + async def continueWithAuth(self, params: dict, session_id: str | None = None) -> None: + self._outer.last_auth = {'params': params, 'session_id': session_id} + + async def continueRequest(self, params: dict, session_id: str | None = None) -> None: + # no-op; included to mirror CDP API surface used by impl + pass + + class _Send: + def __init__(self, outer: 'StubCDP') -> None: + self.Fetch = _FetchSend(outer) + + class _FetchRegister: + def __init__(self, outer: 'StubCDP') -> None: + self._outer = outer + + def authRequired(self, callback) -> None: + self._outer.auth_callback = callback + + def requestPaused(self, callback) -> None: + self._outer.request_paused_callback = callback + + class _Register: + def __init__(self, outer: 'StubCDP') -> None: + self.Fetch = _FetchRegister(outer) + + self.send = _Send(self) + self.register = _Register(self) + + root = StubCDP() + + # Attach stubs to session + session._cdp_client_root = root # type: ignore[attr-defined] + # No need to attach a real CDPSession; _setup_proxy_auth works with root client + + # Should register Fetch handler and enable auth handling without raising + await session._setup_proxy_auth() + + assert root.enabled is True + assert callable(root.auth_callback) + + # Simulate proxy auth required event + ev = {'requestId': 
'r1', 'authChallenge': {'source': 'Proxy'}} + root.auth_callback(ev, session_id='s1') # type: ignore[misc] + + # Let scheduled task run + await asyncio.sleep(0.05) + + assert root.last_auth is not None + params = root.last_auth['params'] + assert params['authChallengeResponse']['response'] == 'ProvideCredentials' + assert params['authChallengeResponse']['username'] == 'user' + assert params['authChallengeResponse']['password'] == 'pass' + assert root.last_auth['session_id'] == 's1' + + # Now simulate a non-proxy auth challenge and ensure default handling + ev2 = {'requestId': 'r2', 'authChallenge': {'source': 'Server'}} + root.auth_callback(ev2, session_id='s2') # type: ignore[misc] + await asyncio.sleep(0.05) + # After non-proxy challenge, last_auth should reflect Default response + assert root.last_auth is not None + params2 = root.last_auth['params'] + assert params2['requestId'] == 'r2' + assert params2['authChallengeResponse']['response'] == 'Default' diff --git a/tests/ci/test_cdp_new_tab_session.py b/tests/ci/test_browser_session_via_cdp_tab_management.py similarity index 100% rename from tests/ci/test_cdp_new_tab_session.py rename to tests/ci/test_browser_session_via_cdp_tab_management.py diff --git a/tests/ci/test_dom_service_chrome_urls.py b/tests/ci/test_browser_watchdog_dom_service_ignore_empty_pages.py similarity index 100% rename from tests/ci/test_dom_service_chrome_urls.py rename to tests/ci/test_browser_watchdog_dom_service_ignore_empty_pages.py diff --git a/tests/ci/test_browser_watchdog_downloads.py b/tests/ci/test_browser_watchdog_downloads.py index 0f3813492..6350b5775 100644 --- a/tests/ci/test_browser_watchdog_downloads.py +++ b/tests/ci/test_browser_watchdog_downloads.py @@ -58,7 +58,7 @@ async def download_test_server(httpserver): return httpserver -@pytest.mark.asyncio +@pytest.mark.skip(reason='TODO: fix') async def test_downloads_watchdog_lifecycle(): """Test that DownloadsWatchdog starts and stops with browser session.""" # Use temp 
directory for downloads @@ -94,7 +94,7 @@ async def test_downloads_watchdog_lifecycle(): await session.event_bus.stop(clear=True, timeout=5) -@pytest.mark.asyncio +@pytest.mark.skip(reason='TODO: fix') async def test_downloads_watchdog_file_detection(download_test_server): """Test that DownloadsWatchdog detects file downloads.""" # Use temp directory for downloads diff --git a/tests/ci/test_browser_session_downloads_simple.py b/tests/ci/test_browser_watchdog_downloads_simple.py similarity index 100% rename from tests/ci/test_browser_session_downloads_simple.py rename to tests/ci/test_browser_watchdog_downloads_simple.py diff --git a/tests/ci/test_download_upload_full_circle.py b/tests/ci/test_browser_watchdog_downloads_upload_full_circle.py similarity index 100% rename from tests/ci/test_download_upload_full_circle.py rename to tests/ci/test_browser_watchdog_downloads_upload_full_circle.py diff --git a/tests/ci/test_browser_session_screenshots.py b/tests/ci/test_browser_watchdog_screenshots.py similarity index 100% rename from tests/ci/test_browser_session_screenshots.py rename to tests/ci/test_browser_watchdog_screenshots.py diff --git a/tests/ci/test_browser_session_allowed_domains.py b/tests/ci/test_browser_watchdog_security2.py similarity index 100% rename from tests/ci/test_browser_session_allowed_domains.py rename to tests/ci/test_browser_watchdog_security2.py diff --git a/tests/ci/test_anthropic_502_error.py b/tests/ci/test_llm_anthropic_502_error.py similarity index 100% rename from tests/ci/test_anthropic_502_error.py rename to tests/ci/test_llm_anthropic_502_error.py diff --git a/tests/ci/test_custom_structured_ouput.py b/tests/ci/test_llm_custom_structured_ouput.py similarity index 100% rename from tests/ci/test_custom_structured_ouput.py rename to tests/ci/test_llm_custom_structured_ouput.py diff --git a/tests/ci/test_gemini_type_field_fix.py b/tests/ci/test_llm_gemini_type_field_fix.py similarity index 100% rename from 
tests/ci/test_gemini_type_field_fix.py rename to tests/ci/test_llm_gemini_type_field_fix.py diff --git a/tests/ci/test_schema_optimizer.py b/tests/ci/test_llm_schema_optimizer.py similarity index 100% rename from tests/ci/test_schema_optimizer.py rename to tests/ci/test_llm_schema_optimizer.py diff --git a/tests/ci/test_proxy_smoke.py b/tests/ci/test_proxy_smoke.py deleted file mode 100644 index 75afd36e0..000000000 --- a/tests/ci/test_proxy_smoke.py +++ /dev/null @@ -1,112 +0,0 @@ -import asyncio -from typing import Any - -import pytest - -from browser_use.browser import BrowserProfile, BrowserSession -from browser_use.config import CONFIG - - -def test_chromium_args_include_proxy_flags(): - profile = BrowserProfile( - headless=True, - user_data_dir=str(CONFIG.BROWSER_USE_PROFILES_DIR / 'proxy-smoke'), - proxy={ - 'server': 'http://proxy.local:8080', - 'bypass': 'localhost,127.0.0.1', - }, - ) - args = profile.get_args() - assert any(a == '--proxy-server=http://proxy.local:8080' for a in args), args - assert any(a == '--proxy-bypass-list=localhost,127.0.0.1' for a in args), args - - -@pytest.mark.asyncio -async def test_cdp_proxy_auth_handler_registers_and_responds(): - # Create profile with proxy auth credentials - profile = BrowserProfile( - headless=True, - user_data_dir=str(CONFIG.BROWSER_USE_PROFILES_DIR / 'proxy-smoke'), - proxy={'username': 'user', 'password': 'pass'}, - ) - session = BrowserSession(browser_profile=profile) - - # Stub CDP client with minimal Fetch support - class StubCDP: - def __init__(self) -> None: - self.enabled = False - self.last_auth: dict[str, Any] | None = None - self.last_default: dict[str, Any] | None = None - self.auth_callback = None - self.request_paused_callback = None - - class _FetchSend: - def __init__(self, outer: 'StubCDP') -> None: - self._outer = outer - - async def enable(self, params: dict, session_id: str | None = None) -> None: - self._outer.enabled = True - - async def continueWithAuth(self, params: dict, 
session_id: str | None = None) -> None: - self._outer.last_auth = {'params': params, 'session_id': session_id} - - async def continueRequest(self, params: dict, session_id: str | None = None) -> None: - # no-op; included to mirror CDP API surface used by impl - pass - - class _Send: - def __init__(self, outer: 'StubCDP') -> None: - self.Fetch = _FetchSend(outer) - - class _FetchRegister: - def __init__(self, outer: 'StubCDP') -> None: - self._outer = outer - - def authRequired(self, callback) -> None: - self._outer.auth_callback = callback - - def requestPaused(self, callback) -> None: - self._outer.request_paused_callback = callback - - class _Register: - def __init__(self, outer: 'StubCDP') -> None: - self.Fetch = _FetchRegister(outer) - - self.send = _Send(self) - self.register = _Register(self) - - root = StubCDP() - - # Attach stubs to session - session._cdp_client_root = root # type: ignore[attr-defined] - # No need to attach a real CDPSession; _setup_proxy_auth works with root client - - # Should register Fetch handler and enable auth handling without raising - await session._setup_proxy_auth() - - assert root.enabled is True - assert callable(root.auth_callback) - - # Simulate proxy auth required event - ev = {'requestId': 'r1', 'authChallenge': {'source': 'Proxy'}} - root.auth_callback(ev, session_id='s1') # type: ignore[misc] - - # Let scheduled task run - await asyncio.sleep(0.05) - - assert root.last_auth is not None - params = root.last_auth['params'] - assert params['authChallengeResponse']['response'] == 'ProvideCredentials' - assert params['authChallengeResponse']['username'] == 'user' - assert params['authChallengeResponse']['password'] == 'pass' - assert root.last_auth['session_id'] == 's1' - - # Now simulate a non-proxy auth challenge and ensure default handling - ev2 = {'requestId': 'r2', 'authChallenge': {'source': 'Server'}} - root.auth_callback(ev2, session_id='s2') # type: ignore[misc] - await asyncio.sleep(0.05) - # After non-proxy 
challenge, last_auth should reflect Default response - assert root.last_auth is not None - params2 = root.last_auth['params'] - assert params2['requestId'] == 'r2' - assert params2['authChallengeResponse']['response'] == 'Default' diff --git a/tests/ci/test_radio_buttons.html b/tests/ci/test_radio_buttons.html new file mode 100644 index 000000000..f2b5d7726 --- /dev/null +++ b/tests/ci/test_radio_buttons.html @@ -0,0 +1,106 @@ + + + + Radio Button Test + + +

Radio Button Test Page

+ +
+
+ Select your favorite color: + + +
+ + +
+ + +
+
+ +
+ Select your favorite animal: + + +
+ + +
+ + +
+
+ + +
+ + + + diff --git a/tests/ci/test_radio_buttons.py b/tests/ci/test_radio_buttons.py new file mode 100644 index 000000000..2a58b202d --- /dev/null +++ b/tests/ci/test_radio_buttons.py @@ -0,0 +1,98 @@ +# @file purpose: Test radio button interactions and serialization in browser-use +""" +Test file for verifying radio button clicking functionality and DOM serialization. + +This test creates a simple HTML page with radio buttons, sends an agent to click them, +and logs the final agent message to show how radio buttons are represented in the serializer. + +The serialization shows radio buttons as: +[index] + +Usage: + uv run pytest tests/ci/test_radio_buttons.py -v -s +""" + +from pathlib import Path + +import pytest +from pytest_httpserver import HTTPServer + +from browser_use.agent.service import Agent +from browser_use.browser import BrowserSession +from browser_use.browser.profile import BrowserProfile + + +@pytest.fixture(scope='session') +def http_server(): + """Create and provide a test HTTP server that serves static content.""" + server = HTTPServer() + server.start() + + # Read the HTML fixture as UTF-8 so the result does not depend on the platform locale + html_file = Path(__file__).parent / 'test_radio_buttons.html' + with open(html_file, 'r', encoding='utf-8') as f: + html_content = f.read() + + # Add route for radio buttons test page + server.expect_request('/radio-test').respond_with_data( + html_content, + content_type='text/html', + ) + + yield server + server.stop() + + +@pytest.fixture(scope='session') +def base_url(http_server): + """Return the base URL for the test HTTP server.""" + return f'http://{http_server.host}:{http_server.port}' + + +@pytest.fixture(scope='module') +async def browser_session(): + """Create, start and provide a headless BrowserSession; it is killed on teardown.""" + browser_session = BrowserSession( + browser_profile=BrowserProfile( + headless=True, + user_data_dir=None, + keep_alive=True, + ) + ) + await browser_session.start() + yield browser_session + await browser_session.kill() + + +class TestRadioButtons: +
"""Test cases for radio button interactions.""" + + async def test_radio_button_clicking(self, browser_session, base_url): + """Test that agent can click radio buttons by checking for secret message.""" + + task = f"Go to {base_url}/radio-test and click on the 'Blue' radio button and the 'Dog' radio button. After clicking both buttons, look for any text message that appears on the page and report exactly what you see." + + agent = Agent( + task=task, + browser_session=browser_session, + max_actions_per_step=5, + flash_mode=True, + ) + + # Run the agent + history = await agent.run(max_steps=8) + + # Check if the secret message appears in the final response + secret_found = False + final_response = history.final_result() + + if final_response and 'SECRET_SUCCESS_12345' in final_response: + secret_found = True + print('\n✅ SUCCESS: Secret message found! Radio buttons were clicked correctly.') + + assert secret_found, ( + "Secret message 'SECRET_SUCCESS_12345' should be present, indicating both Blue and Dog radio buttons were clicked. Actual response: " + + str(final_response) + ) + + print(f'\n🎉 Test completed successfully! Agent completed {len(history)} steps and found the secret message.') diff --git a/tests/ci/test_action_parameter_injection.py b/tests/ci/test_registry_action_parameter_injection.py similarity index 100% rename from tests/ci/test_action_parameter_injection.py rename to tests/ci/test_registry_action_parameter_injection.py diff --git a/tests/ci/test_search_google_tab_focus.py b/tests/ci/test_registry_action_search_google.py similarity index 100% rename from tests/ci/test_search_google_tab_focus.py rename to tests/ci/test_registry_action_search_google.py diff --git a/tests/ci/test_semaphores.py b/tests/ci/test_semaphores.py deleted file mode 100644 index 4f28bea47..000000000 --- a/tests/ci/test_semaphores.py +++ /dev/null @@ -1,522 +0,0 @@ -""" -Test semaphore functionality, especially multiprocess semaphores. 
-""" - -import asyncio -import multiprocessing -import os -import sys -import time -from pathlib import Path - -import pytest - -# Add the browser-use directory to the path so we can import from it -sys.path.insert(0, str(Path(__file__).parent.parent.parent)) - -from bubus.helpers import retry - - -def worker_acquire_semaphore( - worker_id: int, - start_time: float, - results_queue: multiprocessing.Queue, - hold_time: float = 0.5, - timeout: float = 5.0, - should_release: bool = True, -): - """Worker process that tries to acquire a semaphore.""" - try: - print(f'Worker {worker_id} starting...') - - # Define a function decorated with multiprocess semaphore - @retry( - retries=0, - timeout=10, - semaphore_limit=3, # Only 3 concurrent processes allowed - semaphore_name='test_multiprocess_sem', - semaphore_scope='multiprocess', - semaphore_timeout=timeout, - semaphore_lax=False, # Strict mode - must acquire semaphore - ) - async def semaphore_protected_function(): - acquire_time = time.time() - start_time - results_queue.put(('acquired', worker_id, acquire_time)) - - # Hold the semaphore for a bit - await asyncio.sleep(hold_time) - - release_time = time.time() - start_time - results_queue.put(('released', worker_id, release_time)) - return f'Worker {worker_id} completed' - - # Run the async function - print(f'Worker {worker_id} running async function...') - result = asyncio.run(semaphore_protected_function()) - print(f'Worker {worker_id} completed with result: {result}') - results_queue.put(('completed', worker_id, result)) - - except TimeoutError as e: - timeout_time = time.time() - start_time - print(f'Worker {worker_id} timed out: {e}') - results_queue.put(('timeout', worker_id, timeout_time, str(e))) - except Exception as e: - error_time = time.time() - start_time - print(f'Worker {worker_id} error: {type(e).__name__}: {e}') - import traceback - - traceback.print_exc() - results_queue.put(('error', worker_id, error_time, str(e))) - - -def worker_that_dies( - 
worker_id: int, - start_time: float, - results_queue: multiprocessing.Queue, - die_after: float = 0.2, -): - """Worker process that acquires semaphore then dies without releasing.""" - try: - - @retry( - retries=0, - timeout=10, - semaphore_limit=2, # Only 2 concurrent processes - semaphore_name='test_death_sem', - semaphore_scope='multiprocess', - semaphore_timeout=5.0, - semaphore_lax=False, - ) - async def semaphore_protected_function(): - acquire_time = time.time() - start_time - results_queue.put(('acquired', worker_id, acquire_time)) - - # Hold for a bit then simulate crash - await asyncio.sleep(die_after) - - # Simulate unexpected death - os._exit(1) # Hard exit without cleanup - - asyncio.run(semaphore_protected_function()) - - except Exception as e: - error_time = time.time() - start_time - results_queue.put(('error', worker_id, error_time, str(e))) - - -def worker_death_test_normal( - worker_id: int, - start_time: float, - results_queue: multiprocessing.Queue, -): - """Worker for death test that uses the same semaphore.""" - - @retry( - retries=0, - timeout=10, - semaphore_limit=2, - semaphore_name='test_death_sem', - semaphore_scope='multiprocess', - semaphore_timeout=5.0, - semaphore_lax=False, - ) - async def semaphore_protected_function(): - acquire_time = time.time() - start_time - results_queue.put(('acquired', worker_id, acquire_time)) - await asyncio.sleep(0.2) - release_time = time.time() - start_time - results_queue.put(('released', worker_id, release_time)) - return f'Worker {worker_id} completed' - - try: - result = asyncio.run(semaphore_protected_function()) - results_queue.put(('completed', worker_id, result)) - except Exception as e: - error_time = time.time() - start_time - results_queue.put(('error', worker_id, error_time, str(e))) - - -class TestMultiprocessSemaphore: - """Test multiprocess semaphore functionality.""" - - @pytest.mark.skip(reason='Flaky test - FIFO ordering is not guaranteed due to process scheduling') - def 
test_basic_multiprocess_semaphore(self): - """Test that semaphore limits work across processes.""" - results_queue = multiprocessing.Queue() - start_time = time.time() - processes = [] - - # Start 6 worker processes (semaphore limit is 3) - for i in range(6): - p = multiprocessing.Process(target=worker_acquire_semaphore, args=(i, start_time, results_queue, 0.5, 5.0)) - p.start() - processes.append(p) - time.sleep(0.05) # Small delay to ensure processes start in order - - # Wait for all processes to complete - for p in processes: - p.join(timeout=10) - - # Collect results - results = [] - while not results_queue.empty(): - results.append(results_queue.get()) - - # Analyze results - acquired_events = [r for r in results if r[0] == 'acquired'] - released_events = [r for r in results if r[0] == 'released'] - completed_events = [r for r in results if r[0] == 'completed'] - - # All 6 workers should complete successfully - assert len(completed_events) == 6, f'Expected 6 completions, got {len(completed_events)}' - - # Sort by acquisition time - acquired_events.sort(key=lambda x: x[2]) - - # Extract worker IDs in order of acquisition - acquisition_order = [event[1] for event in acquired_events] - - # Verify FIFO order - workers should generally acquire in start order - # Allow some flexibility for first batch due to process startup variations - first_batch = acquisition_order[:3] - second_batch = acquisition_order[3:] - - # All first batch workers should have lower IDs than second batch - max_first_batch = max(first_batch) - min_second_batch = min(second_batch) - assert max_first_batch < min_second_batch, ( - f'First batch (workers {first_batch}) should have lower IDs than second batch (workers {second_batch})' - ) - - # Verify semaphore is actually limiting concurrency - # Check that no more than 3 workers held the semaphore simultaneously - active_workers = [] - # Filter out events that don't have timing information - timed_events = [e for e in results if len(e) >= 3 and 
isinstance(e[2], (int, float))] - for event in sorted(timed_events, key=lambda x: x[2]): # Sort all events by time - if event[0] == 'acquired': - active_workers.append(event[1]) - assert len(active_workers) <= 3, f'Too many workers active: {active_workers}' - elif event[0] == 'released': - if event[1] in active_workers: - active_workers.remove(event[1]) - - def test_semaphore_timeout(self): - """Test that semaphore timeout works correctly.""" - results_queue = multiprocessing.Queue() - start_time = time.time() - processes = [] - - # Start 4 workers with short timeout (semaphore limit is 3) - for i in range(4): - p = multiprocessing.Process( - target=worker_acquire_semaphore, - args=(i, start_time, results_queue, 2.0, 0.5), # 2s hold, 0.5s timeout - ) - p.start() - processes.append(p) - - # Wait for processes - for p in processes: - p.join(timeout=5) - - # Collect results - results = [] - while not results_queue.empty(): - results.append(results_queue.get()) - - # Check that we have timeout events - timeout_events = [r for r in results if r[0] == 'timeout'] - completed_events = [r for r in results if r[0] == 'completed'] - - # 3 should complete, 1 should timeout - assert len(completed_events) == 3, f'Expected 3 completions, got {len(completed_events)}' - assert len(timeout_events) == 1, f'Expected 1 timeout, got {len(timeout_events)}' - - # Verify that timeout occurred before any releases - released_events = [r for r in results if r[0] == 'released'] - if released_events and timeout_events: - min_release_time = min(r[2] for r in released_events) - timeout_time = timeout_events[0][2] - assert timeout_time < min_release_time, ( - f'Timeout should occur before releases. 
Timeout: {timeout_time:.2f}s, First release: {min_release_time:.2f}s' - ) - - def test_process_death_releases_semaphore(self): - """Test that killing a process releases its semaphore slot.""" - results_queue = multiprocessing.Queue() - start_time = time.time() - - # Start 2 processes that will die (limit is 2) - death_processes = [] - for i in range(2): - p = multiprocessing.Process(target=worker_that_dies, args=(i, start_time, results_queue, 0.3)) - p.start() - death_processes.append(p) - - # Wait a bit for them to acquire - time.sleep(0.5) - - # Now start 2 more processes that should be able to acquire after the first 2 die - normal_processes = [] - for i in range(2, 4): - p = multiprocessing.Process(target=worker_death_test_normal, args=(i, start_time, results_queue)) - p.start() - normal_processes.append(p) - - # Wait for death processes to exit - for p in death_processes: - p.join(timeout=2) - assert p.exitcode == 1, f'Process should have exited with code 1, got {p.exitcode}' - - # Wait for normal processes - for p in normal_processes: - p.join(timeout=10) - assert p.exitcode == 0, 'Process should complete successfully' - - # Collect results - results = [] - while not results_queue.empty(): - results.append(results_queue.get()) - - # Check that processes 2 and 3 were able to acquire - acquired_events = [r for r in results if r[0] == 'acquired'] - completed_events = [r for r in results if r[0] == 'completed' and r[1] >= 2] - - # Should have 4 acquisitions total (2 that died + 2 that completed) - assert len(acquired_events) >= 4, f'Expected at least 4 acquisitions, got {len(acquired_events)}' - - # Processes 2 and 3 should complete - assert len(completed_events) == 2, f'Expected 2 completions from workers 2-3, got {len(completed_events)}' - - @pytest.mark.skip(reason='Flaky test - FIFO ordering is not guaranteed due to process scheduling') - def test_concurrent_acquisition_order(self): - """Test that processes acquire semaphore with fairness.""" - results_queue 
= multiprocessing.Queue() - start_time = time.time() - processes = [] - - # Start 5 processes with delays to establish clear order (limit is 2) - for i in range(5): - p = multiprocessing.Process( - target=worker_acquire_semaphore, - args=(i, start_time, results_queue, 0.3, 5.0), # 0.3s hold time - ) - p.start() - processes.append(p) - time.sleep(0.1) # 100ms delay between starts to establish clear order - - # Wait for all to complete - for p in processes: - p.join(timeout=10) - - # Collect and analyze results - results = [] - while not results_queue.empty(): - results.append(results_queue.get()) - - acquired_events = [r for r in results if r[0] == 'acquired'] - acquired_events.sort(key=lambda x: x[2]) # Sort by acquisition time - - # Extract worker IDs in order of acquisition - acquisition_order = [event[1] for event in acquired_events] - - # Verify all workers acquired - assert len(acquisition_order) == 5, f'All 5 workers should acquire, got {len(acquisition_order)}' - assert set(acquisition_order) == {0, 1, 2, 3, 4}, f'All workers should acquire: {acquisition_order}' - - # Verify FIFO order is generally maintained - # Workers started earlier should generally acquire earlier - # We check that the average position of early workers is lower than late workers - early_workers = [0, 1, 2] # Started first - late_workers = [3, 4] # Started later - - early_positions = [acquisition_order.index(w) for w in early_workers] - late_positions = [acquisition_order.index(w) for w in late_workers] - - avg_early = sum(early_positions) / len(early_positions) - avg_late = sum(late_positions) / len(late_positions) - - assert avg_early < avg_late, ( - f'Early workers should acquire before late workers on average. ' - f'Early avg position: {avg_early:.1f}, Late avg position: {avg_late:.1f}. 
' - f'Order: {acquisition_order}' - ) - - def test_semaphore_persistence_across_runs(self): - """Test that semaphore state persists correctly across process runs.""" - results_queue = multiprocessing.Queue() - start_time = time.time() - - # First run: Start 3 processes that hold semaphore (limit is 3) - first_batch = [] - for i in range(3): - p = multiprocessing.Process( - target=worker_acquire_semaphore, - args=(i, start_time, results_queue, 1.0, 5.0), # Hold for 1 second - ) - p.start() - first_batch.append(p) - - # Wait for them to acquire and ensure all slots are taken - time.sleep(0.5) - - # Try to start one more - should timeout quickly - timeout_worker = multiprocessing.Process( - target=worker_acquire_semaphore, - args=(99, start_time, results_queue, 0.5, 0.3), # Very short timeout - ) - timeout_worker.start() - timeout_worker.join(timeout=2) - - # Wait for first batch to complete - for p in first_batch: - p.join(timeout=5) - - # Now start a new batch - should work immediately - second_batch = [] - for i in range(3, 6): - p = multiprocessing.Process(target=worker_acquire_semaphore, args=(i, start_time, results_queue, 0.2, 5.0)) - p.start() - second_batch.append(p) - - for p in second_batch: - p.join(timeout=5) - - # Analyze results - results = [] - while not results_queue.empty(): - results.append(results_queue.get()) - - timeout_events = [r for r in results if r[0] == 'timeout' and r[1] == 99] - second_batch_acquired = [r for r in results if r[0] == 'acquired' and r[1] >= 3] - - # Worker 99 should timeout - assert len(timeout_events) == 1, 'Worker 99 should timeout' - - # Second batch should all acquire successfully - assert len(second_batch_acquired) == 3, 'All second batch workers should acquire' - - # Verify the second batch acquired after the first batch started releasing - # Get the minimum release time from first batch - first_batch_released = [r for r in results if r[0] == 'released' and r[1] < 3] - if first_batch_released: - min_release_time = 
min(r[2] for r in first_batch_released) - # At least one second batch worker should have acquired after first release - second_batch_times = [event[2] for event in second_batch_acquired] - assert any(t >= min_release_time - 0.1 for t in second_batch_times), ( - f'Second batch should acquire after first batch releases. ' - f'Min release: {min_release_time:.2f}, Second batch times: {second_batch_times}' - ) - - -class TestRegularSemaphoreScopes: - """Test non-multiprocess semaphore scopes still work correctly.""" - - async def test_global_scope(self): - """Test global scope semaphore.""" - results = [] - - @retry( - retries=0, - timeout=1, - semaphore_limit=2, - semaphore_scope='global', - semaphore_name='test_global', - ) - async def test_func(worker_id: int): - results.append(('start', worker_id, time.time())) - await asyncio.sleep(0.1) - results.append(('end', worker_id, time.time())) - return worker_id - - # Run 4 tasks concurrently (limit is 2) - tasks = [test_func(i) for i in range(4)] - await asyncio.gather(*tasks) - - # Check that only 2 ran concurrently - starts = [r for r in results if r[0] == 'start'] - starts.sort(key=lambda x: x[2]) - - # First 2 should start immediately - assert starts[1][2] - starts[0][2] < 0.05 - - # 3rd should wait for first to finish - assert starts[2][2] - starts[0][2] > 0.08 - - async def test_class_scope(self): - """Test class scope semaphore.""" - - class TestClass: - def __init__(self): - self.results = [] - - @retry( - retries=0, - timeout=1, - semaphore_limit=1, - semaphore_scope='class', - semaphore_name='test_method', - ) - async def test_method(self, worker_id: int): - self.results.append(('start', worker_id, time.time())) - await asyncio.sleep(0.1) - self.results.append(('end', worker_id, time.time())) - return worker_id - - # Create two instances - obj1 = TestClass() - obj2 = TestClass() - - # Run method on both instances concurrently - # They should share the semaphore (class scope) - start_time = time.time() - await 
asyncio.gather( - obj1.test_method(1), - obj2.test_method(2), - ) - end_time = time.time() - - # Should take ~0.2s (sequential) not ~0.1s (parallel) - assert end_time - start_time > 0.18 - - async def test_self_scope(self): - """Test self scope semaphore.""" - - class TestClass: - def __init__(self): - self.results = [] - - @retry( - retries=0, - timeout=1, - semaphore_limit=1, - semaphore_scope='self', - semaphore_name='test_method', - ) - async def test_method(self, worker_id: int): - self.results.append(('start', worker_id, time.time())) - await asyncio.sleep(0.1) - self.results.append(('end', worker_id, time.time())) - return worker_id - - # Create two instances - obj1 = TestClass() - obj2 = TestClass() - - # Run method on both instances concurrently - # They should NOT share the semaphore (self scope) - start_time = time.time() - await asyncio.gather( - obj1.test_method(1), - obj2.test_method(2), - ) - end_time = time.time() - - # Should take ~0.1s (parallel) not ~0.2s (sequential) - assert end_time - start_time < 0.15 - - -if __name__ == '__main__': - # Run the tests - pytest.main([__file__, '-v']) diff --git a/tests/ci/test_telemetry.py b/tests/ci/test_telemetry.py index 3c12bff12..71867e5bf 100644 --- a/tests/ci/test_telemetry.py +++ b/tests/ci/test_telemetry.py @@ -111,6 +111,8 @@ def test_cli_telemetry_event(): assert 'version' in props assert 'action' in props assert 'mode' in props + assert 'is_docker' in props # Docker context should be included + assert isinstance(props['is_docker'], bool) # Should be a boolean assert 'name' not in props # name should not be in properties @@ -259,6 +261,8 @@ def test_mcp_server_telemetry_event_with_parent_process(): props = event.properties assert 'parent_process_cmdline' in props assert props['parent_process_cmdline'] == 'python -m browser_use.mcp.server' + assert 'is_docker' in props # Docker context should be included + assert isinstance(props['is_docker'], bool) # Should be a boolean def 
test_telemetry_device_id_uses_config_dir():