Merge branch 'main' into feat/custom-screenshot-quality

This commit is contained in:
Enzo Biondo
2025-07-21 13:27:07 -03:00
committed by GitHub
15 changed files with 634 additions and 218 deletions

View File

@@ -1,4 +1,5 @@
import os
from typing import TYPE_CHECKING
from browser_use.logging_config import setup_logging
@@ -13,21 +14,6 @@ else:
# Monkeypatch BaseSubprocessTransport.__del__ to handle closed event loops gracefully
from asyncio import base_subprocess
from browser_use.agent.prompts import SystemPrompt
from browser_use.agent.service import Agent
from browser_use.agent.views import ActionModel, ActionResult, AgentHistoryList
from browser_use.browser import Browser, BrowserConfig, BrowserContext, BrowserContextConfig, BrowserProfile, BrowserSession
from browser_use.controller.service import Controller
from browser_use.dom.service import DomService
from browser_use.llm import (
ChatAnthropic,
ChatAzureOpenAI,
ChatGoogle,
ChatGroq,
ChatOllama,
ChatOpenAI,
)
_original_del = base_subprocess.BaseSubprocessTransport.__del__
@@ -50,6 +36,71 @@ def _patched_del(self):
base_subprocess.BaseSubprocessTransport.__del__ = _patched_del
# Type stubs for lazy imports - fixes linter warnings
if TYPE_CHECKING:
from browser_use.agent.prompts import SystemPrompt
from browser_use.agent.service import Agent
from browser_use.agent.views import ActionModel, ActionResult, AgentHistoryList
from browser_use.browser import Browser, BrowserConfig, BrowserContext, BrowserContextConfig, BrowserProfile, BrowserSession
from browser_use.controller.service import Controller
from browser_use.dom.service import DomService
from browser_use.llm.anthropic.chat import ChatAnthropic
from browser_use.llm.azure.chat import ChatAzureOpenAI
from browser_use.llm.google.chat import ChatGoogle
from browser_use.llm.groq.chat import ChatGroq
from browser_use.llm.ollama.chat import ChatOllama
from browser_use.llm.openai.chat import ChatOpenAI
# Lazy imports mapping - only import when actually accessed
_LAZY_IMPORTS = {
# Agent service (heavy due to dependencies)
'Agent': ('browser_use.agent.service', 'Agent'),
# System prompt (moderate weight due to agent.views imports)
'SystemPrompt': ('browser_use.agent.prompts', 'SystemPrompt'),
# Agent views (very heavy - over 1 second!)
'ActionModel': ('browser_use.agent.views', 'ActionModel'),
'ActionResult': ('browser_use.agent.views', 'ActionResult'),
'AgentHistoryList': ('browser_use.agent.views', 'AgentHistoryList'),
# Browser components (heavy due to playwright/patchright)
'Browser': ('browser_use.browser', 'Browser'),
'BrowserConfig': ('browser_use.browser', 'BrowserConfig'),
'BrowserSession': ('browser_use.browser', 'BrowserSession'),
'BrowserProfile': ('browser_use.browser', 'BrowserProfile'),
'BrowserContext': ('browser_use.browser', 'BrowserContext'),
'BrowserContextConfig': ('browser_use.browser', 'BrowserContextConfig'),
# Controller (moderate weight)
'Controller': ('browser_use.controller.service', 'Controller'),
# DOM service (moderate weight)
'DomService': ('browser_use.dom.service', 'DomService'),
# Chat models (very heavy imports)
'ChatOpenAI': ('browser_use.llm.openai.chat', 'ChatOpenAI'),
'ChatGoogle': ('browser_use.llm.google.chat', 'ChatGoogle'),
'ChatAnthropic': ('browser_use.llm.anthropic.chat', 'ChatAnthropic'),
'ChatGroq': ('browser_use.llm.groq.chat', 'ChatGroq'),
'ChatAzureOpenAI': ('browser_use.llm.azure.chat', 'ChatAzureOpenAI'),
'ChatOllama': ('browser_use.llm.ollama.chat', 'ChatOllama'),
}
def __getattr__(name: str):
	"""Lazy import mechanism - only import modules when they're actually accessed."""
	try:
		module_path, attr_name = _LAZY_IMPORTS[name]
	except KeyError:
		raise AttributeError(f"module '{__name__}' has no attribute '{name}'") from None

	from importlib import import_module

	try:
		attr = getattr(import_module(module_path), attr_name)
	except ImportError as e:
		raise ImportError(f'Failed to import {name} from {module_path}: {e}') from e

	# Cache the resolved attribute in module globals so __getattr__
	# is only hit once per name.
	globals()[name] = attr
	return attr
__all__ = [
'Agent',
'Browser',

View File

@@ -34,7 +34,8 @@ from bubus import EventBus
from pydantic import ValidationError
from uuid_extensions import uuid7str
from browser_use.agent.gif import create_history_gif
# Lazy import for gif to avoid heavy agent.views import at startup
# from browser_use.agent.gif import create_history_gif
from browser_use.agent.message_manager.service import (
MessageManager,
)
@@ -184,6 +185,8 @@ class Agent(Generic[Context, AgentStructuredOutput]):
display_files_in_done_text: bool = True,
include_tool_call_examples: bool = False,
vision_detail_level: Literal['auto', 'low', 'high'] = 'auto',
llm_timeout: int = 60,
step_timeout: int = 180,
**kwargs,
):
# Check for deprecated planner parameters
@@ -261,6 +264,8 @@ class Agent(Generic[Context, AgentStructuredOutput]):
extend_planner_system_message=None, # Always None now (deprecated)
calculate_cost=calculate_cost,
include_tool_call_examples=include_tool_call_examples,
llm_timeout=llm_timeout,
step_timeout=step_timeout,
)
# Token cost service
@@ -280,7 +285,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self._set_browser_use_version_and_source(source)
self.initial_actions = self._convert_initial_actions(initial_actions) if initial_actions else None
# Verify we can connect to the LLM and setup the tool calling method
# Verify we can connect to the model
self._verify_and_setup_llm()
# TODO: move this logic to the LLMs
@@ -644,6 +649,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
self.task = new_task
self._message_manager.add_new_task(new_task)
@observe_debug(ignore_input=True, ignore_output=True, name='_raise_if_stopped_or_paused')
async def _raise_if_stopped_or_paused(self) -> None:
"""Utility function that raises an InterruptedError if the agent is stopped or paused."""
@@ -655,24 +661,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
# self.logger.debug('Agent paused after getting state')
raise InterruptedError
@observe_debug(ignore_input=True, ignore_output=True, name='get_browser_state_with_recovery')
async def _get_browser_state_with_recovery(self, cache_clickable_elements_hashes: bool = True) -> BrowserStateSummary:
	"""Get browser state with multiple fallback strategies for error recovery

	Args:
		cache_clickable_elements_hashes: Forwarded to get_state_summary(); controls
			whether clickable-element hashes are cached between calls.

	Returns:
		The full BrowserStateSummary when retrievable, otherwise the minimal
		summary from the fallback path.
	"""
	assert self.browser_session is not None, 'BrowserSession is not set up'
	# Try 1: Full state summary (current implementation) - like main branch
	try:
		return await self.browser_session.get_state_summary(cache_clickable_elements_hashes)
	except Exception as e:
		# Record the failure as an ActionResult so the LLM sees it on the next step
		if self.state.last_result is None:
			self.state.last_result = []
		self.state.last_result.append(ActionResult(error=str(e)))
		self.logger.warning(f'Full state retrieval failed: {type(e).__name__}: {e}')
		self.logger.warning('🔄 Falling back to minimal state summary')
		# Fallback: reduced summary, expected to succeed even on degraded pages
		return await self.browser_session.get_minimal_state_summary()
@observe(name='agent.step', ignore_output=True, ignore_input=True)
@time_execution_async('--step')
async def step(self, step_info: AgentStepInfo | None = None) -> None:
@@ -707,7 +695,9 @@ class Agent(Generic[Context, AgentStructuredOutput]):
assert self.browser_session is not None, 'BrowserSession is not set up'
self.logger.debug(f'🌐 Step {self.state.n_steps + 1}: Getting browser state...')
browser_state_summary = await self._get_browser_state_with_recovery(cache_clickable_elements_hashes=True)
browser_state_summary = await self.browser_session.get_browser_state_with_recovery(
cache_clickable_elements_hashes=True, include_screenshot=self.settings.use_vision
)
current_page = await self.browser_session.get_current_page()
# Check for new downloads after getting browser state (catches PDF auto-downloads and previous step downloads)
@@ -744,6 +734,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
await self._handle_final_step(step_info)
return browser_state_summary
@observe_debug(ignore_input=True, name='get_next_action')
async def _get_next_action(self, browser_state_summary: BrowserStateSummary) -> None:
"""Execute LLM interaction with retry logic and handle callbacks"""
input_messages = self._message_manager.get_messages()
@@ -751,7 +742,15 @@ class Agent(Generic[Context, AgentStructuredOutput]):
f'🤖 Step {self.state.n_steps + 1}: Calling LLM with {len(input_messages)} messages (model: {self.llm.model})...'
)
model_output = await self._get_model_output_with_retry(input_messages)
try:
model_output = await asyncio.wait_for(
self._get_model_output_with_retry(input_messages), timeout=self.settings.llm_timeout
)
except TimeoutError:
raise TimeoutError(
f'LLM call timed out after {self.settings.llm_timeout} seconds. Keep your thinking and output short.'
)
self.state.last_model_output = model_output
# Check again for paused/stopped state after getting model output
@@ -988,6 +987,7 @@ class Agent(Generic[Context, AgentStructuredOutput]):
return text.strip()
@time_execution_async('--get_next_action')
@observe_debug(ignore_input=True, ignore_output=True, name='get_model_output')
async def get_model_output(self, input_messages: list[BaseMessage]) -> AgentOutput:
"""Get next action from LLM based on current state"""
@@ -1249,15 +1249,15 @@ class Agent(Generic[Context, AgentStructuredOutput]):
try:
await asyncio.wait_for(
self.step(step_info),
timeout=300, # 5 minute step timeout - more generous for slow LLM calls
timeout=self.settings.step_timeout,
)
self.logger.debug(f'✅ Completed step {step + 1}/{max_steps}')
except TimeoutError:
# Handle step timeout gracefully
error_msg = f'Step {step + 1} timed out after 300 seconds'
error_msg = f'Step {step + 1} timed out after {self.settings.step_timeout} seconds'
self.logger.error(f'{error_msg}')
self.state.consecutive_failures += 1
self.state.last_result = [ActionResult(error=error_msg, include_in_memory=True)]
self.state.last_result = [ActionResult(error=error_msg)]
if on_step_end is not None:
await on_step_end(self)
@@ -1347,6 +1347,9 @@ class Agent(Generic[Context, AgentStructuredOutput]):
if isinstance(self.settings.generate_gif, str):
output_path = self.settings.generate_gif
# Lazy import gif module to avoid heavy startup cost
from browser_use.agent.gif import create_history_gif
create_history_gif(task=self.task, history=self.state.history, output_path=output_path)
# Emit output file generated event for GIF
@@ -1381,56 +1384,63 @@ class Agent(Generic[Context, AgentStructuredOutput]):
results: list[ActionResult] = []
assert self.browser_session is not None, 'BrowserSession is not set up'
cached_selector_map = await self.browser_session.get_selector_map()
cached_path_hashes = {e.hash.branch_path_hash for e in cached_selector_map.values()}
try:
await self.browser_session.remove_highlights()
except TimeoutError:
# we don't care if this times out
self.logger.debug('Timeout to remove highlights')
for i, action in enumerate(actions):
# DO NOT ALLOW TO CALL `done` AS A SINGLE ACTION
if i > 0 and action.model_dump(exclude_unset=True).get('done') is not None:
msg = f'Done action is allowed only as a single action - stopped after action {i} / {len(actions)}.'
logger.info(msg)
cached_selector_map = {}
cached_path_hashes = set()
# check all actions if any has index, if so, get the selector map
for action in actions:
if action.get_index() is not None:
cached_selector_map = await self.browser_session.get_selector_map()
cached_path_hashes = {e.hash.branch_path_hash for e in cached_selector_map.values()}
break
if action.get_index() is not None and i != 0:
new_browser_state_summary = await self.browser_session.get_state_summary(cache_clickable_elements_hashes=False)
new_selector_map = new_browser_state_summary.selector_map
# Detect index change after previous action
orig_target = cached_selector_map.get(action.get_index()) # type: ignore
orig_target_hash = orig_target.hash.branch_path_hash if orig_target else None
new_target = new_selector_map.get(action.get_index()) # type: ignore
new_target_hash = new_target.hash.branch_path_hash if new_target else None
if orig_target_hash != new_target_hash:
msg = f'Element index changed after action {i} / {len(actions)}, because page changed.'
# loop over actions and execute them
for i, action in enumerate(actions):
if i > 0:
# ONLY ALLOW TO CALL `done` IF IT IS A SINGLE ACTION
if action.model_dump(exclude_unset=True).get('done') is not None:
msg = f'Done action is allowed only as a single action - stopped after action {i} / {len(actions)}.'
logger.info(msg)
results.append(
ActionResult(
extracted_content=msg,
include_in_memory=True,
long_term_memory=msg,
)
)
break
new_path_hashes = {e.hash.branch_path_hash for e in new_selector_map.values()}
if check_for_new_elements and not new_path_hashes.issubset(cached_path_hashes):
# next action requires index but there are new elements on the page
msg = f'Something new appeared after action {i} / {len(actions)}, following actions are NOT executed and should be retried.'
logger.info(msg)
results.append(
ActionResult(
extracted_content=msg,
include_in_memory=True,
long_term_memory=msg,
)
if action.get_index() is not None:
new_browser_state_summary = await self.browser_session.get_browser_state_with_recovery(
cache_clickable_elements_hashes=False, include_screenshot=False
)
break
new_selector_map = new_browser_state_summary.selector_map
# Detect index change after previous action
orig_target = cached_selector_map.get(action.get_index()) # type: ignore
orig_target_hash = orig_target.hash.branch_path_hash if orig_target else None
new_target = new_selector_map.get(action.get_index()) # type: ignore
new_target_hash = new_target.hash.branch_path_hash if new_target else None
if orig_target_hash != new_target_hash:
msg = f'Element index changed after action {i} / {len(actions)}, because page changed.'
logger.info(msg)
results.append(
ActionResult(
extracted_content=msg,
include_in_memory=True,
long_term_memory=msg,
)
)
break
new_path_hashes = {e.hash.branch_path_hash for e in new_selector_map.values()}
if check_for_new_elements and not new_path_hashes.issubset(cached_path_hashes):
# next action requires index but there are new elements on the page
msg = f'Something new appeared after action {i} / {len(actions)}, following actions are NOT executed and should be retried.'
logger.info(msg)
results.append(
ActionResult(
extracted_content=msg,
include_in_memory=True,
long_term_memory=msg,
)
)
break
# wait between actions
await asyncio.sleep(self.browser_profile.wait_between_actions)
try:
await self._raise_if_stopped_or_paused()
@@ -1455,9 +1465,6 @@ class Agent(Generic[Context, AgentStructuredOutput]):
if results[-1].is_done or results[-1].error or i == len(actions) - 1:
break
await asyncio.sleep(self.browser_profile.wait_between_actions)
# hash all elements. if it is a subset of cached_state its fine - else break (new elements on page)
except Exception as e:
# Handle any exceptions during action execution
self.logger.error(f'Action {i + 1} failed: {type(e).__name__}: {e}')
@@ -1535,7 +1542,9 @@ class Agent(Generic[Context, AgentStructuredOutput]):
async def _execute_history_step(self, history_item: AgentHistory, delay: float) -> list[ActionResult]:
"""Execute a single step from history with element validation"""
assert self.browser_session is not None, 'BrowserSession is not set up'
state = await self.browser_session.get_state_summary(cache_clickable_elements_hashes=False)
state = await self.browser_session.get_browser_state_with_recovery(
cache_clickable_elements_hashes=False, include_screenshot=False
)
if not state or not history_item.model_output:
raise ValueError('Invalid state or model output')
updated_actions = []

View File

@@ -65,6 +65,8 @@ class AgentSettings(BaseModel):
extend_planner_system_message: str | None = None
calculate_cost: bool = False
include_tool_call_examples: bool = False
llm_timeout: int = 60 # Timeout in seconds for LLM calls
step_timeout: int = 180 # Timeout in seconds for each step
class AgentState(BaseModel):

View File

@@ -1,6 +1,41 @@
from .browser import Browser, BrowserConfig
from .context import BrowserContext, BrowserContextConfig
from .profile import BrowserProfile
from .session import BrowserSession
from typing import TYPE_CHECKING
# Type stubs for lazy imports
if TYPE_CHECKING:
from .browser import Browser, BrowserConfig
from .context import BrowserContext, BrowserContextConfig
from .profile import BrowserProfile
from .session import BrowserSession
# Lazy imports mapping for heavy browser components
_LAZY_IMPORTS = {
'Browser': ('.browser', 'Browser'),
'BrowserConfig': ('.browser', 'BrowserConfig'),
'BrowserContext': ('.context', 'BrowserContext'),
'BrowserContextConfig': ('.context', 'BrowserContextConfig'),
'BrowserProfile': ('.profile', 'BrowserProfile'),
'BrowserSession': ('.session', 'BrowserSession'),
}
def __getattr__(name: str):
	"""Lazy import mechanism for heavy browser components."""
	if name not in _LAZY_IMPORTS:
		raise AttributeError(f"module '{__name__}' has no attribute '{name}'")

	module_path, attr_name = _LAZY_IMPORTS[name]
	from importlib import import_module

	# Resolve the relative module path against this package explicitly
	full_module_path = f'browser_use.browser{module_path}'
	try:
		attr = getattr(import_module(full_module_path), attr_name)
	except ImportError as e:
		raise ImportError(f'Failed to import {name} from {full_module_path}: {e}') from e

	# Cache the resolved attribute so later lookups skip __getattr__ entirely
	globals()[name] = attr
	return attr
__all__ = ['Browser', 'BrowserConfig', 'BrowserContext', 'BrowserContextConfig', 'BrowserSession', 'BrowserProfile']

View File

@@ -169,6 +169,10 @@ CHROME_DEFAULT_ARGS = [
'--disable-desktop-notifications',
'--noerrdialogs',
'--silent-debugger-extension-api',
# Extension welcome tab suppression for automation
'--disable-extensions-http-throttling',
'--extensions-on-chrome-urls',
'--disable-default-apps',
f'--disable-features={",".join(CHROME_DISABLED_COMPONENTS)}',
]
@@ -558,6 +562,10 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
description='List of allowed domains for navigation e.g. ["*.google.com", "https://example.com", "chrome-extension://*"]',
)
keep_alive: bool | None = Field(default=None, description='Keep browser alive after agent run.')
enable_default_extensions: bool = Field(
default=True,
description="Enable automation-optimized extensions: ad blocking (uBlock Origin), cookie handling (I still don't care about cookies), and URL cleaning (ClearURLs). All extensions work automatically without manual intervention. Extensions are automatically downloaded and loaded when enabled.",
)
window_size: ViewportSize | None = Field(
default=None,
description='Browser window size to use when headless=False.',
@@ -620,6 +628,7 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
window_size['width'] = window_size['width'] or self.window_width or 1280
window_size['height'] = window_size['height'] or self.window_height or 1100
self.window_size = window_size
return self
@model_validator(mode='after')
@@ -699,12 +708,162 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
if self.window_position
else []
),
*(self._get_extension_args() if self.enable_default_extensions else []),
]
# convert to dict and back to dedupe and merge duplicate args
final_args_list = BrowserLaunchArgs.args_as_list(BrowserLaunchArgs.args_as_dict(pre_conversion_args))
return final_args_list
def _get_extension_args(self) -> list[str]:
	"""Get Chrome args for enabling default extensions (ad blocker and cookie handler)."""
	# Make sure the extensions are present on disk before building the flags
	paths = self._ensure_default_extensions_downloaded()

	flags = [
		'--enable-extensions',
		'--disable-extensions-file-access-check',
		'--disable-extensions-http-throttling',
		'--enable-extension-activity-logging',
	]
	if paths:
		# Chrome accepts a comma-separated list of unpacked extension dirs
		flags.append('--load-extension=' + ','.join(paths))
	return flags
def _ensure_default_extensions_downloaded(self) -> list[str]:
	"""
	Ensure default extensions are downloaded and cached locally.
	Returns list of paths to extension directories.
	"""
	from pathlib import Path

	# Extension definitions - optimized for automation and content extraction
	default_extensions = [
		{
			'name': 'uBlock Origin',
			'id': 'cjpalhdlnbpafiamejdnhcphjbkeiagm',
			'url': 'https://clients2.google.com/service/update2/crx?response=redirect&prodversion=130&acceptformat=crx3&x=id%3Dcjpalhdlnbpafiamejdnhcphjbkeiagm%26uc',
		},
		{
			'name': "I still don't care about cookies",
			'id': 'edibdbjcniadpccecjdfdjjppcpchdlm',
			'url': 'https://clients2.google.com/service/update2/crx?response=redirect&prodversion=130&acceptformat=crx3&x=id%3Dedibdbjcniadpccecjdfdjjppcpchdlm%26uc',
		},
		{
			'name': 'ClearURLs',
			'id': 'lckanjgmijmafbedllaakclkaicjfmnk',
			'url': 'https://clients2.google.com/service/update2/crx?response=redirect&prodversion=130&acceptformat=crx3&x=id%3Dlckanjgmijmafbedllaakclkaicjfmnk%26uc',
		},
	]

	# Shared on-disk cache so multiple sessions reuse the same downloads
	cache_dir = Path.home() / '.browser-use' / 'extensions'
	cache_dir.mkdir(parents=True, exist_ok=True)

	ready_paths: list[str] = []
	ready_names: list[str] = []

	for spec in default_extensions:
		unpacked_dir = cache_dir / spec['id']
		crx_path = cache_dir / f'{spec["id"]}.crx'

		# Cached and already unpacked with a manifest -> reuse as-is
		if unpacked_dir.exists() and (unpacked_dir / 'manifest.json').exists():
			ready_paths.append(str(unpacked_dir))
			ready_names.append(spec['name'])
			continue

		try:
			# Fetch the .crx only when it is not already cached
			if not crx_path.exists():
				logger.info(f'📦 Downloading {spec["name"]} extension...')
				self._download_extension(spec['url'], crx_path)

			# Unpack the archive so Chrome can load it with --load-extension
			if crx_path.exists():
				logger.info(f'📂 Extracting {spec["name"]} extension...')
				self._extract_extension(crx_path, unpacked_dir)
				ready_paths.append(str(unpacked_dir))
				ready_names.append(spec['name'])
		except Exception as e:
			# Best-effort: a single broken extension must not block the browser launch
			logger.warning(f'⚠️ Failed to setup {spec["name"]} extension: {e}')
			continue

	if ready_paths:
		logger.info(f'✅ Extensions ready: {len(ready_paths)} extensions loaded ({", ".join(ready_names)})')
	else:
		logger.warning('⚠️ No default extensions could be loaded')

	return ready_paths
def _download_extension(self, url: str, output_path: Path) -> None:
"""Download extension .crx file."""
import urllib.request
try:
with urllib.request.urlopen(url) as response:
with open(output_path, 'wb') as f:
f.write(response.read())
except Exception as e:
raise Exception(f'Failed to download extension: {e}')
def _extract_extension(self, crx_path: Path, extract_dir: Path) -> None:
"""Extract .crx file to directory."""
import os
import zipfile
# Remove existing directory
if extract_dir.exists():
import shutil
shutil.rmtree(extract_dir)
extract_dir.mkdir(parents=True, exist_ok=True)
try:
# CRX files are ZIP files with a header, try to extract as ZIP
with zipfile.ZipFile(crx_path, 'r') as zip_ref:
zip_ref.extractall(extract_dir)
# Verify manifest exists
if not (extract_dir / 'manifest.json').exists():
raise Exception('No manifest.json found in extension')
except zipfile.BadZipFile:
# CRX files have a header before the ZIP data
# Skip the CRX header and extract the ZIP part
with open(crx_path, 'rb') as f:
# Read CRX header to find ZIP start
magic = f.read(4)
if magic != b'Cr24':
raise Exception('Invalid CRX file format')
version = int.from_bytes(f.read(4), 'little')
if version == 2:
pubkey_len = int.from_bytes(f.read(4), 'little')
sig_len = int.from_bytes(f.read(4), 'little')
f.seek(16 + pubkey_len + sig_len) # Skip to ZIP data
elif version == 3:
header_len = int.from_bytes(f.read(4), 'little')
f.seek(12 + header_len) # Skip to ZIP data
# Extract ZIP data
zip_data = f.read()
# Write ZIP data to temp file and extract
import tempfile
with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as temp_zip:
temp_zip.write(zip_data)
temp_zip.flush()
with zipfile.ZipFile(temp_zip.name, 'r') as zip_ref:
zip_ref.extractall(extract_dir)
os.unlink(temp_zip.name)
def kwargs_for_launch_persistent_context(self) -> BrowserLaunchPersistentContextArgs:
	"""Return the kwargs for BrowserType.launch_persistent_context()."""
	return BrowserLaunchPersistentContextArgs(**self.model_dump(exclude={'args'}), args=self.get_args())
@@ -721,22 +880,6 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro
"""Return the kwargs for BrowserType.connect_over_cdp()."""
return BrowserLaunchArgs(**self.model_dump(exclude={'args'}), args=self.get_args())
# def preinstall_extensions(self) -> None:
# """Preinstall the extensions."""
# # create the local unpacked extensions dir
# extensions_dir = self.user_data_dir / 'Extensions'
# extensions_dir.mkdir(parents=True, exist_ok=True)
# # download from the chrome web store using the chrome web store api
# for extension_id in self.extension_ids_to_preinstall:
# extension_path = extensions_dir / f'{extension_id}.crx'
# if extension_path.exists():
# logger.warning(f'⚠️ Extension {extension_id} is already installed, skipping preinstall.')
# else:
# logger.info(f'🔍 Preinstalling extension {extension_id}...')
# # TODO: copy this from ArchiveBox implementation
@observe_debug(ignore_input=True, ignore_output=True, name='detect_display_configuration')
def detect_display_configuration(self) -> None:
"""

View File

@@ -51,8 +51,10 @@ from browser_use.browser.views import (
TabInfo,
URLNotAllowedError,
)
from browser_use.dom.clickable_element_processor.service import ClickableElementProcessor
from browser_use.dom.service import DomService
# Lazy imports for heavy DOM services to improve startup time
# from browser_use.dom.clickable_element_processor.service import ClickableElementProcessor
# from browser_use.dom.service import DomService
from browser_use.dom.views import DOMElementNode, SelectorMap
from browser_use.utils import (
is_new_tab_page,
@@ -160,12 +162,14 @@ def require_healthy_browser(usable_page=True, reopen_page=True):
await self._recover_unresponsive_page(
func.__name__, timeout_ms=int(self.browser_profile.default_navigation_timeout or 5000) + 5_000
)
page_url = self.agent_current_page.url if self.agent_current_page else 'unknown page'
self.logger.debug(
f'🤕 Crashed page recovery finished, attempting to continue with {func.__name__}() on {_log_pretty_url(self.agent_current_page.url)}...'
f'🤕 Crashed page recovery finished, attempting to continue with {func.__name__}() on {_log_pretty_url(page_url)}...'
)
except Exception as e:
page_url = self.agent_current_page.url if self.agent_current_page else 'unknown page'
self.logger.warning(
f'❌ Crashed page recovery failed, could not run {func.__name__}(), page is stuck unresponsive on {_log_pretty_url(self.agent_current_page.url)}...'
f'❌ Crashed page recovery failed, could not run {func.__name__}(), page is stuck unresponsive on {_log_pretty_url(page_url)}...'
)
raise # Re-raise to let retry decorator / callsite handle it
@@ -384,10 +388,19 @@ class BrowserSession(BaseModel):
# Ensure we have a context
assert self.browser_context, f'Failed to create BrowserContext for browser={self.browser}'
# Configure browser
await self._setup_viewports()
await self._setup_current_page_change_listeners()
await self._start_context_tracing()
# Configure browser - run some setup tasks in parallel for speed
setup_results = await asyncio.gather(
self._setup_viewports(),
self._setup_current_page_change_listeners(),
self._start_context_tracing(),
return_exceptions=True,
)
# Check for exceptions in setup results
for i, result in enumerate(setup_results):
if isinstance(result, Exception):
setup_task_names = ['_setup_viewports', '_setup_current_page_change_listeners', '_start_context_tracing']
raise Exception(f'Browser setup failed in {setup_task_names[i]}: {result}') from result
self.initialized = True
return self
@@ -837,6 +850,7 @@ class BrowserSession(BaseModel):
atexit.register(shudown_playwright)
@observe_debug(ignore_input=True, ignore_output=True, name='setup_browser_via_passed_objects')
async def setup_browser_via_passed_objects(self) -> None:
"""Override to customize the set up of the connection to an existing browser"""
@@ -878,6 +892,7 @@ class BrowserSession(BaseModel):
self.logger.info(f'🎭 Connected to existing user-provided browser: {self.browser_context}')
self._set_browser_keep_alive(True) # we connected to an existing browser, dont kill it at the end
@observe_debug(ignore_input=True, ignore_output=True, name='setup_browser_via_browser_pid')
async def setup_browser_via_browser_pid(self) -> None:
"""if browser_pid is provided, calcuclate its CDP URL by looking for --remote-debugging-port=... in its CLI args, then connect to it"""
@@ -922,11 +937,10 @@ class BrowserSession(BaseModel):
# Wait for CDP port to become available (Chrome might still be starting)
import httpx
# Add initial delay to give Chrome time to start up before first check
await asyncio.sleep(2)
# No initial sleep needed - the polling loop below handles waiting if Chrome isn't ready yet
async with httpx.AsyncClient() as client:
for i in range(30): # 30 second timeout
for i in range(30): # timeout
# First check if the Chrome process has exited
try:
chrome_process = psutil.Process(pid=self.browser_pid)
@@ -988,7 +1002,7 @@ class BrowserSession(BaseModel):
except (httpx.ConnectError, httpx.TimeoutException):
if i == 0:
self.logger.debug(f'⏳ Waiting for Chrome CDP port {debug_port} to become available...')
await asyncio.sleep(1)
await asyncio.sleep(0.5)
else:
self.logger.error(f'❌ Chrome CDP port {debug_port} did not become available after 30 seconds')
self.browser_pid = None
@@ -1010,6 +1024,7 @@ class BrowserSession(BaseModel):
)
self._set_browser_keep_alive(True) # we connected to an existing browser, dont kill it at the end
@observe_debug(ignore_input=True, ignore_output=True, name='setup_browser_via_wss_url')
async def setup_browser_via_wss_url(self) -> None:
"""check for a passed wss_url, connect to a remote playwright browser server via WSS"""
@@ -1044,7 +1059,8 @@ class BrowserSession(BaseModel):
)
self._set_browser_keep_alive(True) # we connected to an existing browser, dont kill it at the end
@retry(wait=1, retries=2, timeout=45, semaphore_limit=1, semaphore_scope='self', semaphore_lax=False)
@observe_debug(ignore_input=True, ignore_output=True, name='setup_new_browser_context')
@retry(wait=0.1, retries=5, timeout=45, semaphore_limit=1, semaphore_scope='self', semaphore_lax=False)
async def setup_new_browser_context(self) -> None:
"""Launch a new browser and browser_context"""
# Double-check after semaphore acquisition to prevent duplicate browser launches
@@ -1059,6 +1075,7 @@ class BrowserSession(BaseModel):
pass
await self._unsafe_setup_new_browser_context()
@observe_debug(ignore_input=True, ignore_output=True, name='_unsafe_setup_new_browser_context')
async def _unsafe_setup_new_browser_context(self) -> None:
"""Unsafe browser context setup without retry protection."""
@@ -2015,7 +2032,6 @@ class BrowserSession(BaseModel):
await page.wait_for_selector(selector, state='visible', timeout=timeout)
@observe_debug(name='remove_highlights', ignore_output=True, ignore_input=True)
@require_healthy_browser(usable_page=True, reopen_page=True)
@time_execution_async('--remove_highlights')
@retry(timeout=2, retries=0)
async def remove_highlights(self):
@@ -2048,14 +2064,16 @@ class BrowserSession(BaseModel):
self.logger.debug(f'⚠️ Failed to remove highlights (this is usually ok): {type(e).__name__}: {e}')
# Don't raise the error since this is not critical functionality
@observe_debug(ignore_output=True, name='get_dom_element_by_index')
@require_healthy_browser(usable_page=True, reopen_page=True)
async def get_dom_element_by_index(self, index: int) -> DOMElementNode | None:
"""Get DOM element by index."""
selector_map = await self.get_selector_map()
return selector_map.get(index)
@require_healthy_browser(usable_page=True, reopen_page=True)
@time_execution_async('--click_element_node')
@observe_debug(ignore_input=True, name='click_element_node')
@require_healthy_browser(usable_page=True, reopen_page=True)
async def _click_element_node(self, element_node: DOMElementNode) -> str | None:
"""
Optimized method to click an element using xpath.
@@ -2069,7 +2087,8 @@ class BrowserSession(BaseModel):
element_handle = await self.get_locate_element(element_node)
if element_handle is None:
raise Exception(f'Element: {repr(element_node)} not found')
self.logger.debug(f'Element: {repr(element_node)} not found')
raise Exception('Element not found')
async def perform_click(click_func):
"""Performs the actual click, handling both download and navigation scenarios."""
@@ -2163,10 +2182,10 @@ class BrowserSession(BaseModel):
except URLNotAllowedError as e:
raise e
except Exception as e:
raise Exception(f'Failed to click element: {repr(element_node)}. Error: {str(e)}')
raise Exception(f'Failed to click element. Error: {str(e)}')
@time_execution_async('--get_tabs_info')
@retry(timeout=6, retries=1)
@retry(timeout=3, retries=1)
@require_healthy_browser(usable_page=False, reopen_page=False)
async def get_tabs_info(self) -> list[TabInfo]:
"""Get information about all tabs"""
@@ -2174,7 +2193,7 @@ class BrowserSession(BaseModel):
tabs_info = []
for page_id, page in enumerate(self.browser_context.pages):
try:
title = await asyncio.wait_for(page.title(), timeout=3.0)
title = await asyncio.wait_for(page.title(), timeout=2.0)
tab_info = TabInfo(page_id=page_id, url=page.url, title=title)
except Exception:
# page.title() can hang forever on tabs that are crashed/disappeared/about:blank
@@ -2255,8 +2274,14 @@ class BrowserSession(BaseModel):
# Check if URL is allowed
if not self._is_url_allowed(normalized_url):
raise BrowserError(f'⛔️ Navigation to non-allowed URL: {normalized_url}')
timeout_ms = min(3000, int(timeout_ms or self.browser_profile.default_navigation_timeout or 12000))
# If timeout_ms is not None, use it (even if 0); else try profile.default_navigation_timeout (even if 0); else 12000
if timeout_ms is not None:
user_timeout_ms = int(timeout_ms)
elif self.browser_profile.default_navigation_timeout is not None:
user_timeout_ms = int(self.browser_profile.default_navigation_timeout)
else:
user_timeout_ms = 12000
timeout_ms = min(3000, user_timeout_ms)
# Handle new tab creation
if new_tab:
@@ -2279,7 +2304,7 @@ class BrowserSession(BaseModel):
# Navigate to URL
try:
# Use asyncio.wait to prevent hanging on slow page loads
# Use asyncio.wait to prevent hanging on a slow page loads
# Don't cap the timeout - respect what was requested
self.logger.debug(f'🧭 Starting navigation to {_log_pretty_url(normalized_url)} with timeout {timeout_ms}ms')
nav_task = asyncio.create_task(page.goto(normalized_url, wait_until='load', timeout=timeout_ms))
@@ -2797,15 +2822,27 @@ class BrowserSession(BaseModel):
@observe_debug(ignore_input=True, ignore_output=True, name='wait_for_page_and_frames_load')
async def _wait_for_page_and_frames_load(self, timeout_overwrite: float | None = None):
"""
Ensures page is fully loaded before continuing.
Waits for either network to be idle or minimum WAIT_TIME, whichever is longer.
Ensures page is fully loaded and stable before continuing.
Waits for network idle, DOM stability, and minimum WAIT_TIME.
Also checks if the loaded URL is allowed.
Parameters:
-----------
timeout_overwrite: float | None
Override the minimum wait time
"""
# Start timing
start_time = time.time()
# Wait for page load
page = await self.get_current_page()
# Skip network waiting for new tab pages (about:blank, chrome://new-tab-page, etc.)
# These pages load instantly and don't need network idle time
if is_new_tab_page(page.url):
self.logger.debug(f'⚡ Skipping page load wait for new tab page: {page.url}')
return
try:
await self._wait_for_stable_network()
@@ -3052,7 +3089,9 @@ class BrowserSession(BaseModel):
@observe_debug(ignore_input=True, ignore_output=True)
@time_execution_async('--get_state_summary')
@require_healthy_browser(usable_page=True, reopen_page=True)
async def get_state_summary(self, cache_clickable_elements_hashes: bool) -> BrowserStateSummary:
async def get_state_summary(
self, cache_clickable_elements_hashes: bool, include_screenshot: bool = True
) -> BrowserStateSummary:
self.logger.debug('🔄 Starting get_state_summary...')
"""Get a summary of the current browser state
@@ -3065,13 +3104,19 @@ class BrowserSession(BaseModel):
If True, cache the clickable elements hashes for the current state.
This is used to calculate which elements are new to the LLM since the last message,
which helps reduce token usage.
include_screenshot: bool
If True, include screenshot in the state summary. Set to False to improve performance
when screenshots are not needed (e.g., in multi_act element validation).
"""
await self._wait_for_page_and_frames_load()
updated_state = await self._get_updated_state()
updated_state = await self._get_updated_state(include_screenshot=include_screenshot)
# Find out which elements are new
# Do this only if url has not changed
if cache_clickable_elements_hashes:
# Lazy import heavy DOM service
from browser_use.dom.clickable_element_processor.service import ClickableElementProcessor
# if we are on the same url as the last state, we can use the cached hashes
if self._cached_clickable_element_hashes and self._cached_clickable_element_hashes.url == updated_state.url:
# Pointers, feel free to edit in place
@@ -3142,20 +3187,12 @@ class BrowserSession(BaseModel):
)
@observe_debug(ignore_input=True, ignore_output=True, name='get_updated_state')
async def _get_updated_state(self, focus_element: int = -1) -> BrowserStateSummary:
async def _get_updated_state(self, focus_element: int = -1, include_screenshot: bool = True) -> BrowserStateSummary:
"""Update and return state."""
# Check if current page is still valid, if not switch to another available page
page = await self.get_current_page()
try:
# Test if page is still accessible
# NOTE: This also happens on invalid urls like www.sadfdsafdssdafd.com
await asyncio.wait_for(page.evaluate('1'), timeout=2.5)
except Exception as e:
self.logger.debug(f'👋 Current page is not accessible: {type(e).__name__}: {e}')
raise BrowserError('Page is not accessible')
try:
self.logger.debug('🧹 Removing highlights...')
try:
@@ -3172,6 +3209,8 @@ class BrowserSession(BaseModel):
self.logger.debug(f'PDF auto-download check failed: {type(e).__name__}: {e}')
self.logger.debug('🌳 Starting DOM processing...')
from browser_use.dom.service import DomService
dom_service = DomService(page, logger=self.logger)
try:
content = await asyncio.wait_for(
@@ -3228,13 +3267,16 @@ class BrowserSession(BaseModel):
# )
# )
try:
self.logger.debug('📸 Capturing screenshot...')
# Reasonable timeout for screenshot
screenshot_b64 = await self.take_screenshot()
# self.logger.debug('✅ Screenshot completed')
except Exception as e:
self.logger.warning(f'❌ Screenshot failed for {_log_pretty_url(page.url)}: {type(e).__name__} {e}')
if include_screenshot:
try:
self.logger.debug('📸 Capturing screenshot...')
# Reasonable timeout for screenshot
screenshot_b64 = await self.take_screenshot()
# self.logger.debug('✅ Screenshot completed')
except Exception as e:
self.logger.warning(f'❌ Screenshot failed for {_log_pretty_url(page.url)}: {type(e).__name__} {e}')
screenshot_b64 = None
else:
screenshot_b64 = None
# Get comprehensive page information
@@ -3475,6 +3517,7 @@ class BrowserSession(BaseModel):
'Browser is unable to load any new about:blank pages (something is very wrong or browser is extremely overloaded)'
)
@observe_debug(ignore_input=True, name='recover_unresponsive_page')
async def _recover_unresponsive_page(self, calling_method: str, timeout_ms: int | None = None) -> None:
"""Recover from an unresponsive page by closing and reopening it."""
self.logger.warning(f'⚠️ Page JS engine became unresponsive in {calling_method}(), attempting recovery...')
@@ -3828,6 +3871,7 @@ class BrowserSession(BaseModel):
@require_healthy_browser(usable_page=True, reopen_page=True)
@time_execution_async('--get_locate_element')
@observe_debug(ignore_input=True, name='get_locate_element')
async def get_locate_element(self, element: DOMElementNode) -> ElementHandle | None:
page = await self.get_current_page()
current_frame = page
@@ -3881,7 +3925,7 @@ class BrowserSession(BaseModel):
if element_handle:
is_visible = await self._is_visible(element_handle)
if is_visible:
await element_handle.scroll_into_view_if_needed()
await element_handle.scroll_into_view_if_needed(timeout=1_000)
return element_handle
return None
except Exception as e:
@@ -3897,7 +3941,7 @@ class BrowserSession(BaseModel):
if element_handle:
is_visible = await self._is_visible(element_handle)
if is_visible:
await element_handle.scroll_into_view_if_needed()
await element_handle.scroll_into_view_if_needed(timeout=1_000)
return element_handle
except Exception as xpath_e:
self.logger.error(
@@ -3924,7 +3968,7 @@ class BrowserSession(BaseModel):
if element_handle:
is_visible = await self._is_visible(element_handle)
if is_visible:
await element_handle.scroll_into_view_if_needed()
await element_handle.scroll_into_view_if_needed(timeout=1_000)
return element_handle
return None
except Exception as e:
@@ -3945,7 +3989,7 @@ class BrowserSession(BaseModel):
if element_handle:
is_visible = await self._is_visible(element_handle)
if is_visible:
await element_handle.scroll_into_view_if_needed()
await element_handle.scroll_into_view_if_needed(timeout=1_000)
return element_handle
return None
except Exception as e:
@@ -3989,7 +4033,7 @@ class BrowserSession(BaseModel):
is_visible = await self._is_visible(element_handle)
if is_visible:
await element_handle.scroll_into_view_if_needed()
await element_handle.scroll_into_view_if_needed(timeout=1_000)
return element_handle
except Exception as e:
self.logger.error(
@@ -3999,6 +4043,7 @@ class BrowserSession(BaseModel):
@require_healthy_browser(usable_page=True, reopen_page=True)
@time_execution_async('--input_text_element_node')
@observe_debug(ignore_input=True, name='input_text_element_node')
async def _input_text_element_node(self, element_node: DOMElementNode, text: str):
"""
Input text into an element with proper error handling and state management.
@@ -4022,7 +4067,7 @@ class BrowserSession(BaseModel):
# let's first try to click and type
try:
await element_handle.evaluate('el => {el.textContent = ""; el.value = "";}')
await element_handle.click()
await element_handle.click(timeout=2_000) # Add 2 second timeout
await asyncio.sleep(0.1) # Increased sleep time
page = await self.get_current_page()
await page.keyboard.type(text)
@@ -4044,9 +4089,9 @@ class BrowserSession(BaseModel):
try:
if (await is_contenteditable.json_value() or tag_name == 'input') and not (readonly or disabled):
await element_handle.evaluate('el => {el.textContent = ""; el.value = "";}')
await element_handle.type(text, delay=5)
await element_handle.type(text, delay=5, timeout=5_000) # Add 5 second timeout
else:
await element_handle.fill(text)
await element_handle.fill(text, timeout=3_000) # Add 3 second timeout
except Exception as e:
self.logger.error(f'Error during input text into element: {type(e).__name__}: {e}')
raise BrowserError(f'Failed to input text into element: {repr(element_node)}')
@@ -4471,32 +4516,29 @@ class BrowserSession(BaseModel):
except Exception as e:
self.logger.debug(f'❌ Failed to show 📀 DVD loading animation: {type(e).__name__}: {e}')
@observe_debug(ignore_input=True, ignore_output=True, name='get_state_summary_with_fallback')
@require_healthy_browser(usable_page=True, reopen_page=True)
@time_execution_async('--get_state_summary_with_fallback')
async def get_state_summary_with_fallback(self, cache_clickable_elements_hashes: bool = True) -> BrowserStateSummary:
"""Get browser state with fallback to minimal state on errors
This method first tries to get a full state summary. If that fails,
it falls back to a minimal state summary to allow basic navigation.
@observe_debug(ignore_input=True, ignore_output=True, name='get_browser_state_with_recovery')
async def get_browser_state_with_recovery(
self, cache_clickable_elements_hashes: bool = True, include_screenshot: bool = True
) -> BrowserStateSummary:
"""Get browser state with multiple fallback strategies for error recovery
Parameters:
-----------
cache_clickable_elements_hashes: bool
If True, cache the clickable elements hashes for the current state.
Returns:
--------
BrowserStateSummary: Either full state or minimal fallback state
include_screenshot: bool
If True, include screenshot in the state summary. Set to False to improve performance
when screenshots are not needed (e.g., in multi_act element validation).
"""
# Try 1: Full state summary (current implementation)
# Try 1: Full state summary (current implementation) - like main branch
try:
return await self.get_state_summary(cache_clickable_elements_hashes)
await self._wait_for_page_and_frames_load()
return await self.get_state_summary(cache_clickable_elements_hashes, include_screenshot=include_screenshot)
except Exception as e:
self.logger.warning(f'Full state retrieval failed: {type(e).__name__}: {e}')
self.logger.warning('🔄 Falling back to minimal state summary')
# Try 2: Minimal state summary as fallback
self.logger.warning('🔄 Falling back to minimal state summary')
return await self.get_minimal_state_summary()
async def _is_pdf_viewer(self, page: Page) -> bool:

View File

@@ -130,23 +130,20 @@ class Controller(Generic[Context]):
await browser_session.go_back()
msg = '🔙 Navigated back'
logger.info(msg)
return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory='Navigated back')
return ActionResult(extracted_content=msg)
# wait for x seconds
@self.registry.action('Wait for x seconds default 3 (max 10 seconds)')
@self.registry.action(
'Wait for x seconds default 3 (max 10 seconds). This can be used to wait until the page is fully loaded.'
)
async def wait(seconds: int = 3):
# Cap wait time at maximum 10 seconds
actual_seconds = min(max(seconds, 0), 10)
if actual_seconds != seconds:
msg = f'🕒 Waiting for {actual_seconds} seconds (capped from {seconds} seconds, max 10 seconds)'
else:
msg = f'🕒 Waiting for {actual_seconds} seconds'
# Reduce the wait time by 3 seconds to account for the llm call which takes at least 3 seconds
# So if the model decides to wait for 5 seconds, the llm call took at least 3 seconds, so we only need to wait for 2 seconds
actual_seconds = min(max(seconds - 3, 0), 10)
msg = f'🕒 Waiting for {actual_seconds + 3} seconds'
logger.info(msg)
await asyncio.sleep(actual_seconds)
return ActionResult(
extracted_content=msg, include_in_memory=True, long_term_memory=f'Waited for {actual_seconds} seconds'
)
return ActionResult(extracted_content=msg)
# Element Interaction Actions

View File

@@ -15,6 +15,7 @@ from browser_use.dom.views import (
SelectorMap,
ViewportInfo,
)
from browser_use.observability import observe_debug
from browser_use.utils import is_new_tab_page, time_execution_async
# @dataclass
@@ -34,6 +35,7 @@ class DomService:
self.js_code = resources.files('browser_use.dom.dom_tree').joinpath('index.js').read_text()
# region - Clickable elements
@observe_debug(ignore_input=True, ignore_output=True, name='get_clickable_elements')
@time_execution_async('--get_clickable_elements')
async def get_clickable_elements(
self,

View File

@@ -4,14 +4,10 @@ We have switched all of our code from langchain to openai.types.chat.chat_comple
For easier transition we have
"""
from browser_use.llm.anthropic.chat import ChatAnthropic
from browser_use.llm.aws.chat_anthropic import ChatAnthropicBedrock
from browser_use.llm.aws.chat_bedrock import ChatAWSBedrock
from browser_use.llm.azure.chat import ChatAzureOpenAI
from typing import TYPE_CHECKING
# Lightweight imports that are commonly used
from browser_use.llm.base import BaseChatModel
from browser_use.llm.deepseek.chat import ChatDeepSeek
from browser_use.llm.google.chat import ChatGoogle
from browser_use.llm.groq.chat import ChatGroq
from browser_use.llm.messages import (
AssistantMessage,
BaseMessage,
@@ -27,11 +23,52 @@ from browser_use.llm.messages import (
from browser_use.llm.messages import (
ContentPartTextParam as ContentText,
)
from browser_use.llm.ollama.chat import ChatOllama
from browser_use.llm.openai.chat import ChatOpenAI
from browser_use.llm.openrouter.chat import ChatOpenRouter
# Make better names for the message
# Type stubs for lazy imports
if TYPE_CHECKING:
from browser_use.llm.anthropic.chat import ChatAnthropic
from browser_use.llm.aws.chat_anthropic import ChatAnthropicBedrock
from browser_use.llm.aws.chat_bedrock import ChatAWSBedrock
from browser_use.llm.azure.chat import ChatAzureOpenAI
from browser_use.llm.deepseek.chat import ChatDeepSeek
from browser_use.llm.google.chat import ChatGoogle
from browser_use.llm.groq.chat import ChatGroq
from browser_use.llm.ollama.chat import ChatOllama
from browser_use.llm.openai.chat import ChatOpenAI
from browser_use.llm.openrouter.chat import ChatOpenRouter
# Lazy imports mapping for heavy chat models
_LAZY_IMPORTS = {
'ChatAnthropic': ('browser_use.llm.anthropic.chat', 'ChatAnthropic'),
'ChatAnthropicBedrock': ('browser_use.llm.aws.chat_anthropic', 'ChatAnthropicBedrock'),
'ChatAWSBedrock': ('browser_use.llm.aws.chat_bedrock', 'ChatAWSBedrock'),
'ChatAzureOpenAI': ('browser_use.llm.azure.chat', 'ChatAzureOpenAI'),
'ChatDeepSeek': ('browser_use.llm.deepseek.chat', 'ChatDeepSeek'),
'ChatGoogle': ('browser_use.llm.google.chat', 'ChatGoogle'),
'ChatGroq': ('browser_use.llm.groq.chat', 'ChatGroq'),
'ChatOllama': ('browser_use.llm.ollama.chat', 'ChatOllama'),
'ChatOpenAI': ('browser_use.llm.openai.chat', 'ChatOpenAI'),
'ChatOpenRouter': ('browser_use.llm.openrouter.chat', 'ChatOpenRouter'),
}
def __getattr__(name: str):
"""Lazy import mechanism for heavy chat model imports."""
if name in _LAZY_IMPORTS:
module_path, attr_name = _LAZY_IMPORTS[name]
try:
from importlib import import_module
module = import_module(module_path)
attr = getattr(module, attr_name)
# Cache the imported attribute in the module's globals
globals()[name] = attr
return attr
except ImportError as e:
raise ImportError(f'Failed to import {name} from {module_path}: {e}') from e
raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
__all__ = [
# Message types -> for easier transition from langchain

View File

@@ -1,5 +1,34 @@
from browser_use.llm.aws.chat_anthropic import ChatAnthropicBedrock
from browser_use.llm.aws.chat_bedrock import ChatAWSBedrock
from typing import TYPE_CHECKING
# Type stubs for lazy imports
if TYPE_CHECKING:
from browser_use.llm.aws.chat_anthropic import ChatAnthropicBedrock
from browser_use.llm.aws.chat_bedrock import ChatAWSBedrock
# Lazy imports mapping for AWS chat models
_LAZY_IMPORTS = {
'ChatAnthropicBedrock': ('browser_use.llm.aws.chat_anthropic', 'ChatAnthropicBedrock'),
'ChatAWSBedrock': ('browser_use.llm.aws.chat_bedrock', 'ChatAWSBedrock'),
}
def __getattr__(name: str):
"""Lazy import mechanism for AWS chat models."""
if name in _LAZY_IMPORTS:
module_path, attr_name = _LAZY_IMPORTS[name]
try:
from importlib import import_module
module = import_module(module_path)
attr = getattr(module, attr_name)
# Cache the imported attribute in the module's globals
globals()[name] = attr
return attr
except ImportError as e:
raise ImportError(f'Failed to import {name} from {module_path}: {e}') from e
raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
__all__ = [
'ChatAWSBedrock',

View File

@@ -35,7 +35,8 @@ class ChatOpenAI(BaseChatModel):
model: ChatModel | str
# Model params
temperature: float | None = None
temperature: float | None = 0.2
frequency_penalty: float | None = 0.05
reasoning_effort: ReasoningEffort = 'low'
# Client initialization parameters
@@ -50,6 +51,8 @@ class ChatOpenAI(BaseChatModel):
default_query: Mapping[str, object] | None = None
http_client: httpx.AsyncClient | None = None
_strict_response_validation: bool = False
max_completion_tokens: int | None = 8000
top_p: float | None = None
# Static
@property
@@ -144,12 +147,24 @@ class ChatOpenAI(BaseChatModel):
try:
model_params: dict[str, Any] = {}
if self.model in ReasoningModels:
model_params['reasoning_effort'] = self.reasoning_effort
if self.temperature is not None:
model_params['temperature'] = self.temperature
if self.frequency_penalty is not None:
model_params['frequency_penalty'] = self.frequency_penalty
if self.max_completion_tokens is not None:
model_params['max_completion_tokens'] = self.max_completion_tokens
if self.top_p is not None:
model_params['top_p'] = self.top_p
if self.model in ReasoningModels:
model_params['reasoning_effort'] = self.reasoning_effort
model_params['temperature'] = 1
model_params['frequency_penalty'] = 0
if output_format is None:
# Return string response
response = await self.get_client().chat.completions.create(

View File

@@ -659,7 +659,7 @@ class BrowserUseServer:
if not self.browser_session:
return 'Error: No browser session active'
state = await self.browser_session.get_state_summary(cache_clickable_elements_hashes=False)
state = await self.browser_session.get_browser_state_with_recovery(cache_clickable_elements_hashes=False)
result = {
'url': state.url,

View File

@@ -2,18 +2,50 @@
Telemetry for Browser Use.
"""
from browser_use.telemetry.service import ProductTelemetry
from browser_use.telemetry.views import (
BaseTelemetryEvent,
CLITelemetryEvent,
MCPClientTelemetryEvent,
MCPServerTelemetryEvent,
)
from typing import TYPE_CHECKING
# Type stubs for lazy imports
if TYPE_CHECKING:
from browser_use.telemetry.service import ProductTelemetry
from browser_use.telemetry.views import (
BaseTelemetryEvent,
CLITelemetryEvent,
MCPClientTelemetryEvent,
MCPServerTelemetryEvent,
)
# Lazy imports mapping
_LAZY_IMPORTS = {
'ProductTelemetry': ('browser_use.telemetry.service', 'ProductTelemetry'),
'BaseTelemetryEvent': ('browser_use.telemetry.views', 'BaseTelemetryEvent'),
'CLITelemetryEvent': ('browser_use.telemetry.views', 'CLITelemetryEvent'),
'MCPClientTelemetryEvent': ('browser_use.telemetry.views', 'MCPClientTelemetryEvent'),
'MCPServerTelemetryEvent': ('browser_use.telemetry.views', 'MCPServerTelemetryEvent'),
}
def __getattr__(name: str):
"""Lazy import mechanism for telemetry components."""
if name in _LAZY_IMPORTS:
module_path, attr_name = _LAZY_IMPORTS[name]
try:
from importlib import import_module
module = import_module(module_path)
attr = getattr(module, attr_name)
# Cache the imported attribute in the module's globals
globals()[name] = attr
return attr
except ImportError as e:
raise ImportError(f'Failed to import {name} from {module_path}: {e}') from e
raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
__all__ = [
'BaseTelemetryEvent',
'ProductTelemetry',
'CLITelemetryEvent',
'MCPClientTelemetryEvent',
'MCPServerTelemetryEvent',
'CLITelemetryEvent',
]

View File

@@ -31,10 +31,10 @@ dependencies = [
"typing-extensions>=4.12.2",
"uuid7>=0.1.0",
"authlib>=1.6.0",
"google-genai>=1.21.1",
"openai>=1.81.0",
"anthropic>=0.54.0",
"groq>=0.28.0",
"google-genai>=1.26.0",
"openai>=1.97.0",
"anthropic>=0.58.2",
"groq>=0.30.0",
"ollama>=0.5.1",
"google-api-python-client>=2.174.0",
"google-auth>=2.40.3",

View File

@@ -328,8 +328,30 @@ class TestControllerIntegration:
assert result.extracted_content is not None
assert 'Waiting for' in result.extracted_content
# Verify that at least 1 second has passed
assert end_time - start_time >= 0.9 # Allow some timing margin
# Verify that less than 0.1 second has passed (because we deducted 3 seconds to account for the llm call)
assert end_time - start_time <= 0.1 # Allow some timing margin
# longer wait
# Create wait action for 1 second - fix to use a dictionary
wait_action = {'wait': {'seconds': 5}} # Corrected format
# Record start time
start_time = time.time()
# Execute wait action
result = await controller.act(WaitActionModel(**wait_action), browser_session)
# Record end time
end_time = time.time()
# Verify the result
assert isinstance(result, ActionResult)
assert result.extracted_content is not None
assert 'Waiting for' in result.extracted_content
# Verify that we took 2 sec (5s-3s (llm call)= 2s)
assert end_time - start_time <= 2.1 # Allow some timing margin
assert end_time - start_time >= 1.9 # Allow some timing margin
async def test_go_back_action(self, controller, browser_session, base_url):
"""Test that go_back action navigates to the previous page."""