diff --git a/browser_use/__init__.py b/browser_use/__init__.py index 1635b1e79..0c079efe7 100644 --- a/browser_use/__init__.py +++ b/browser_use/__init__.py @@ -1,4 +1,5 @@ import os +from typing import TYPE_CHECKING from browser_use.logging_config import setup_logging @@ -13,21 +14,6 @@ else: # Monkeypatch BaseSubprocessTransport.__del__ to handle closed event loops gracefully from asyncio import base_subprocess -from browser_use.agent.prompts import SystemPrompt -from browser_use.agent.service import Agent -from browser_use.agent.views import ActionModel, ActionResult, AgentHistoryList -from browser_use.browser import Browser, BrowserConfig, BrowserContext, BrowserContextConfig, BrowserProfile, BrowserSession -from browser_use.controller.service import Controller -from browser_use.dom.service import DomService -from browser_use.llm import ( - ChatAnthropic, - ChatAzureOpenAI, - ChatGoogle, - ChatGroq, - ChatOllama, - ChatOpenAI, -) - _original_del = base_subprocess.BaseSubprocessTransport.__del__ @@ -50,6 +36,71 @@ def _patched_del(self): base_subprocess.BaseSubprocessTransport.__del__ = _patched_del +# Type stubs for lazy imports - fixes linter warnings +if TYPE_CHECKING: + from browser_use.agent.prompts import SystemPrompt + from browser_use.agent.service import Agent + from browser_use.agent.views import ActionModel, ActionResult, AgentHistoryList + from browser_use.browser import Browser, BrowserConfig, BrowserContext, BrowserContextConfig, BrowserProfile, BrowserSession + from browser_use.controller.service import Controller + from browser_use.dom.service import DomService + from browser_use.llm.anthropic.chat import ChatAnthropic + from browser_use.llm.azure.chat import ChatAzureOpenAI + from browser_use.llm.google.chat import ChatGoogle + from browser_use.llm.groq.chat import ChatGroq + from browser_use.llm.ollama.chat import ChatOllama + from browser_use.llm.openai.chat import ChatOpenAI + + +# Lazy imports mapping - only import when actually accessed 
+_LAZY_IMPORTS = { + # Agent service (heavy due to dependencies) + 'Agent': ('browser_use.agent.service', 'Agent'), + # System prompt (moderate weight due to agent.views imports) + 'SystemPrompt': ('browser_use.agent.prompts', 'SystemPrompt'), + # Agent views (very heavy - over 1 second!) + 'ActionModel': ('browser_use.agent.views', 'ActionModel'), + 'ActionResult': ('browser_use.agent.views', 'ActionResult'), + 'AgentHistoryList': ('browser_use.agent.views', 'AgentHistoryList'), + # Browser components (heavy due to playwright/patchright) + 'Browser': ('browser_use.browser', 'Browser'), + 'BrowserConfig': ('browser_use.browser', 'BrowserConfig'), + 'BrowserSession': ('browser_use.browser', 'BrowserSession'), + 'BrowserProfile': ('browser_use.browser', 'BrowserProfile'), + 'BrowserContext': ('browser_use.browser', 'BrowserContext'), + 'BrowserContextConfig': ('browser_use.browser', 'BrowserContextConfig'), + # Controller (moderate weight) + 'Controller': ('browser_use.controller.service', 'Controller'), + # DOM service (moderate weight) + 'DomService': ('browser_use.dom.service', 'DomService'), + # Chat models (very heavy imports) + 'ChatOpenAI': ('browser_use.llm.openai.chat', 'ChatOpenAI'), + 'ChatGoogle': ('browser_use.llm.google.chat', 'ChatGoogle'), + 'ChatAnthropic': ('browser_use.llm.anthropic.chat', 'ChatAnthropic'), + 'ChatGroq': ('browser_use.llm.groq.chat', 'ChatGroq'), + 'ChatAzureOpenAI': ('browser_use.llm.azure.chat', 'ChatAzureOpenAI'), + 'ChatOllama': ('browser_use.llm.ollama.chat', 'ChatOllama'), +} + + +def __getattr__(name: str): + """Lazy import mechanism - only import modules when they're actually accessed.""" + if name in _LAZY_IMPORTS: + module_path, attr_name = _LAZY_IMPORTS[name] + try: + from importlib import import_module + + module = import_module(module_path) + attr = getattr(module, attr_name) + # Cache the imported attribute in the module's globals + globals()[name] = attr + return attr + except ImportError as e: + raise 
ImportError(f'Failed to import {name} from {module_path}: {e}') from e + + raise AttributeError(f"module '{__name__}' has no attribute '{name}'") + + __all__ = [ 'Agent', 'Browser', diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py index 48b3db5e9..eaf42326f 100644 --- a/browser_use/agent/service.py +++ b/browser_use/agent/service.py @@ -34,7 +34,8 @@ from bubus import EventBus from pydantic import ValidationError from uuid_extensions import uuid7str -from browser_use.agent.gif import create_history_gif +# Lazy import for gif to avoid heavy agent.views import at startup +# from browser_use.agent.gif import create_history_gif from browser_use.agent.message_manager.service import ( MessageManager, ) @@ -184,6 +185,8 @@ class Agent(Generic[Context, AgentStructuredOutput]): display_files_in_done_text: bool = True, include_tool_call_examples: bool = False, vision_detail_level: Literal['auto', 'low', 'high'] = 'auto', + llm_timeout: int = 60, + step_timeout: int = 180, **kwargs, ): # Check for deprecated planner parameters @@ -261,6 +264,8 @@ class Agent(Generic[Context, AgentStructuredOutput]): extend_planner_system_message=None, # Always None now (deprecated) calculate_cost=calculate_cost, include_tool_call_examples=include_tool_call_examples, + llm_timeout=llm_timeout, + step_timeout=step_timeout, ) # Token cost service @@ -280,7 +285,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): self._set_browser_use_version_and_source(source) self.initial_actions = self._convert_initial_actions(initial_actions) if initial_actions else None - # Verify we can connect to the LLM and setup the tool calling method + # Verify we can connect to the model self._verify_and_setup_llm() # TODO: move this logic to the LLMs @@ -644,6 +649,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): self.task = new_task self._message_manager.add_new_task(new_task) + @observe_debug(ignore_input=True, ignore_output=True, name='_raise_if_stopped_or_paused') async 
def _raise_if_stopped_or_paused(self) -> None: """Utility function that raises an InterruptedError if the agent is stopped or paused.""" @@ -655,24 +661,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): # self.logger.debug('Agent paused after getting state') raise InterruptedError - @observe_debug(ignore_input=True, ignore_output=True, name='get_browser_state_with_recovery') - async def _get_browser_state_with_recovery(self, cache_clickable_elements_hashes: bool = True) -> BrowserStateSummary: - """Get browser state with multiple fallback strategies for error recovery""" - - assert self.browser_session is not None, 'BrowserSession is not set up' - - # Try 1: Full state summary (current implementation) - like main branch - try: - return await self.browser_session.get_state_summary(cache_clickable_elements_hashes) - except Exception as e: - if self.state.last_result is None: - self.state.last_result = [] - self.state.last_result.append(ActionResult(error=str(e))) - self.logger.warning(f'Full state retrieval failed: {type(e).__name__}: {e}') - - self.logger.warning('๐Ÿ”„ Falling back to minimal state summary') - return await self.browser_session.get_minimal_state_summary() - @observe(name='agent.step', ignore_output=True, ignore_input=True) @time_execution_async('--step') async def step(self, step_info: AgentStepInfo | None = None) -> None: @@ -707,7 +695,9 @@ class Agent(Generic[Context, AgentStructuredOutput]): assert self.browser_session is not None, 'BrowserSession is not set up' self.logger.debug(f'๐ŸŒ Step {self.state.n_steps + 1}: Getting browser state...') - browser_state_summary = await self._get_browser_state_with_recovery(cache_clickable_elements_hashes=True) + browser_state_summary = await self.browser_session.get_browser_state_with_recovery( + cache_clickable_elements_hashes=True, include_screenshot=self.settings.use_vision + ) current_page = await self.browser_session.get_current_page() # Check for new downloads after getting browser state 
(catches PDF auto-downloads and previous step downloads) @@ -744,6 +734,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): await self._handle_final_step(step_info) return browser_state_summary + @observe_debug(ignore_input=True, name='get_next_action') async def _get_next_action(self, browser_state_summary: BrowserStateSummary) -> None: """Execute LLM interaction with retry logic and handle callbacks""" input_messages = self._message_manager.get_messages() @@ -751,7 +742,15 @@ class Agent(Generic[Context, AgentStructuredOutput]): f'๐Ÿค– Step {self.state.n_steps + 1}: Calling LLM with {len(input_messages)} messages (model: {self.llm.model})...' ) - model_output = await self._get_model_output_with_retry(input_messages) + try: + model_output = await asyncio.wait_for( + self._get_model_output_with_retry(input_messages), timeout=self.settings.llm_timeout + ) + except TimeoutError: + raise TimeoutError( + f'LLM call timed out after {self.settings.llm_timeout} seconds. Keep your thinking and output short.' 
+ ) + self.state.last_model_output = model_output # Check again for paused/stopped state after getting model output @@ -988,6 +987,7 @@ class Agent(Generic[Context, AgentStructuredOutput]): return text.strip() @time_execution_async('--get_next_action') + @observe_debug(ignore_input=True, ignore_output=True, name='get_model_output') async def get_model_output(self, input_messages: list[BaseMessage]) -> AgentOutput: """Get next action from LLM based on current state""" @@ -1249,15 +1249,15 @@ class Agent(Generic[Context, AgentStructuredOutput]): try: await asyncio.wait_for( self.step(step_info), - timeout=300, # 5 minute step timeout - more generous for slow LLM calls + timeout=self.settings.step_timeout, ) self.logger.debug(f'โœ… Completed step {step + 1}/{max_steps}') except TimeoutError: # Handle step timeout gracefully - error_msg = f'Step {step + 1} timed out after 300 seconds' + error_msg = f'Step {step + 1} timed out after {self.settings.step_timeout} seconds' self.logger.error(f'โฐ {error_msg}') self.state.consecutive_failures += 1 - self.state.last_result = [ActionResult(error=error_msg, include_in_memory=True)] + self.state.last_result = [ActionResult(error=error_msg)] if on_step_end is not None: await on_step_end(self) @@ -1347,6 +1347,9 @@ class Agent(Generic[Context, AgentStructuredOutput]): if isinstance(self.settings.generate_gif, str): output_path = self.settings.generate_gif + # Lazy import gif module to avoid heavy startup cost + from browser_use.agent.gif import create_history_gif + create_history_gif(task=self.task, history=self.state.history, output_path=output_path) # Emit output file generated event for GIF @@ -1381,56 +1384,63 @@ class Agent(Generic[Context, AgentStructuredOutput]): results: list[ActionResult] = [] assert self.browser_session is not None, 'BrowserSession is not set up' - cached_selector_map = await self.browser_session.get_selector_map() - cached_path_hashes = {e.hash.branch_path_hash for e in cached_selector_map.values()} - 
- try: - await self.browser_session.remove_highlights() - except TimeoutError: - # we don't care if this times out - self.logger.debug('Timeout to remove highlights') - - for i, action in enumerate(actions): - # DO NOT ALLOW TO CALL `done` AS A SINGLE ACTION - if i > 0 and action.model_dump(exclude_unset=True).get('done') is not None: - msg = f'Done action is allowed only as a single action - stopped after action {i} / {len(actions)}.' - logger.info(msg) + cached_selector_map = {} + cached_path_hashes = set() + # check all actions if any has index, if so, get the selector map + for action in actions: + if action.get_index() is not None: + cached_selector_map = await self.browser_session.get_selector_map() + cached_path_hashes = {e.hash.branch_path_hash for e in cached_selector_map.values()} break - if action.get_index() is not None and i != 0: - new_browser_state_summary = await self.browser_session.get_state_summary(cache_clickable_elements_hashes=False) - new_selector_map = new_browser_state_summary.selector_map - - # Detect index change after previous action - orig_target = cached_selector_map.get(action.get_index()) # type: ignore - orig_target_hash = orig_target.hash.branch_path_hash if orig_target else None - new_target = new_selector_map.get(action.get_index()) # type: ignore - new_target_hash = new_target.hash.branch_path_hash if new_target else None - if orig_target_hash != new_target_hash: - msg = f'Element index changed after action {i} / {len(actions)}, because page changed.' + # loop over actions and execute them + for i, action in enumerate(actions): + if i > 0: + # ONLY ALLOW TO CALL `done` IF IT IS A SINGLE ACTION + if action.model_dump(exclude_unset=True).get('done') is not None: + msg = f'Done action is allowed only as a single action - stopped after action {i} / {len(actions)}.' 
logger.info(msg) - results.append( - ActionResult( - extracted_content=msg, - include_in_memory=True, - long_term_memory=msg, - ) - ) break - new_path_hashes = {e.hash.branch_path_hash for e in new_selector_map.values()} - if check_for_new_elements and not new_path_hashes.issubset(cached_path_hashes): - # next action requires index but there are new elements on the page - msg = f'Something new appeared after action {i} / {len(actions)}, following actions are NOT executed and should be retried.' - logger.info(msg) - results.append( - ActionResult( - extracted_content=msg, - include_in_memory=True, - long_term_memory=msg, - ) + if action.get_index() is not None: + new_browser_state_summary = await self.browser_session.get_browser_state_with_recovery( + cache_clickable_elements_hashes=False, include_screenshot=False ) - break + new_selector_map = new_browser_state_summary.selector_map + + # Detect index change after previous action + orig_target = cached_selector_map.get(action.get_index()) # type: ignore + orig_target_hash = orig_target.hash.branch_path_hash if orig_target else None + new_target = new_selector_map.get(action.get_index()) # type: ignore + new_target_hash = new_target.hash.branch_path_hash if new_target else None + if orig_target_hash != new_target_hash: + msg = f'Element index changed after action {i} / {len(actions)}, because page changed.' + logger.info(msg) + results.append( + ActionResult( + extracted_content=msg, + include_in_memory=True, + long_term_memory=msg, + ) + ) + break + + new_path_hashes = {e.hash.branch_path_hash for e in new_selector_map.values()} + if check_for_new_elements and not new_path_hashes.issubset(cached_path_hashes): + # next action requires index but there are new elements on the page + msg = f'Something new appeared after action {i} / {len(actions)}, following actions are NOT executed and should be retried.' 
+ logger.info(msg) + results.append( + ActionResult( + extracted_content=msg, + include_in_memory=True, + long_term_memory=msg, + ) + ) + break + + # wait between actions + await asyncio.sleep(self.browser_profile.wait_between_actions) try: await self._raise_if_stopped_or_paused() @@ -1455,9 +1465,6 @@ class Agent(Generic[Context, AgentStructuredOutput]): if results[-1].is_done or results[-1].error or i == len(actions) - 1: break - await asyncio.sleep(self.browser_profile.wait_between_actions) - # hash all elements. if it is a subset of cached_state its fine - else break (new elements on page) - except Exception as e: # Handle any exceptions during action execution self.logger.error(f'Action {i + 1} failed: {type(e).__name__}: {e}') @@ -1535,7 +1542,9 @@ class Agent(Generic[Context, AgentStructuredOutput]): async def _execute_history_step(self, history_item: AgentHistory, delay: float) -> list[ActionResult]: """Execute a single step from history with element validation""" assert self.browser_session is not None, 'BrowserSession is not set up' - state = await self.browser_session.get_state_summary(cache_clickable_elements_hashes=False) + state = await self.browser_session.get_browser_state_with_recovery( + cache_clickable_elements_hashes=False, include_screenshot=False + ) if not state or not history_item.model_output: raise ValueError('Invalid state or model output') updated_actions = [] diff --git a/browser_use/agent/views.py b/browser_use/agent/views.py index e565704e2..20353950d 100644 --- a/browser_use/agent/views.py +++ b/browser_use/agent/views.py @@ -65,6 +65,8 @@ class AgentSettings(BaseModel): extend_planner_system_message: str | None = None calculate_cost: bool = False include_tool_call_examples: bool = False + llm_timeout: int = 60 # Timeout in seconds for LLM calls + step_timeout: int = 180 # Timeout in seconds for each step class AgentState(BaseModel): diff --git a/browser_use/browser/__init__.py b/browser_use/browser/__init__.py index 
eaea05808..d6c13f2d6 100644 --- a/browser_use/browser/__init__.py +++ b/browser_use/browser/__init__.py @@ -1,6 +1,41 @@ -from .browser import Browser, BrowserConfig -from .context import BrowserContext, BrowserContextConfig -from .profile import BrowserProfile -from .session import BrowserSession +from typing import TYPE_CHECKING + +# Type stubs for lazy imports +if TYPE_CHECKING: + from .browser import Browser, BrowserConfig + from .context import BrowserContext, BrowserContextConfig + from .profile import BrowserProfile + from .session import BrowserSession + +# Lazy imports mapping for heavy browser components +_LAZY_IMPORTS = { + 'Browser': ('.browser', 'Browser'), + 'BrowserConfig': ('.browser', 'BrowserConfig'), + 'BrowserContext': ('.context', 'BrowserContext'), + 'BrowserContextConfig': ('.context', 'BrowserContextConfig'), + 'BrowserProfile': ('.profile', 'BrowserProfile'), + 'BrowserSession': ('.session', 'BrowserSession'), +} + + +def __getattr__(name: str): + """Lazy import mechanism for heavy browser components.""" + if name in _LAZY_IMPORTS: + module_path, attr_name = _LAZY_IMPORTS[name] + try: + from importlib import import_module + + # Use relative import for current package + full_module_path = f'browser_use.browser{module_path}' + module = import_module(full_module_path) + attr = getattr(module, attr_name) + # Cache the imported attribute in the module's globals + globals()[name] = attr + return attr + except ImportError as e: + raise ImportError(f'Failed to import {name} from {full_module_path}: {e}') from e + + raise AttributeError(f"module '{__name__}' has no attribute '{name}'") + __all__ = ['Browser', 'BrowserConfig', 'BrowserContext', 'BrowserContextConfig', 'BrowserSession', 'BrowserProfile'] diff --git a/browser_use/browser/profile.py b/browser_use/browser/profile.py index b8ada7f1d..e3b9afdb9 100644 --- a/browser_use/browser/profile.py +++ b/browser_use/browser/profile.py @@ -169,6 +169,10 @@ CHROME_DEFAULT_ARGS = [ 
'--disable-desktop-notifications', '--noerrdialogs', '--silent-debugger-extension-api', + # Extension welcome tab suppression for automation + '--disable-extensions-http-throttling', + '--extensions-on-chrome-urls', + '--disable-default-apps', f'--disable-features={",".join(CHROME_DISABLED_COMPONENTS)}', ] @@ -558,6 +562,10 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro description='List of allowed domains for navigation e.g. ["*.google.com", "https://example.com", "chrome-extension://*"]', ) keep_alive: bool | None = Field(default=None, description='Keep browser alive after agent run.') + enable_default_extensions: bool = Field( + default=True, + description="Enable automation-optimized extensions: ad blocking (uBlock Origin), cookie handling (I still don't care about cookies), and URL cleaning (ClearURLs). All extensions work automatically without manual intervention. Extensions are automatically downloaded and loaded when enabled.", + ) window_size: ViewportSize | None = Field( default=None, description='Browser window size to use when headless=False.', @@ -620,6 +628,7 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro window_size['width'] = window_size['width'] or self.window_width or 1280 window_size['height'] = window_size['height'] or self.window_height or 1100 self.window_size = window_size + return self @model_validator(mode='after') @@ -699,12 +708,162 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro if self.window_position else [] ), + *(self._get_extension_args() if self.enable_default_extensions else []), ] # convert to dict and back to dedupe and merge duplicate args final_args_list = BrowserLaunchArgs.args_as_list(BrowserLaunchArgs.args_as_dict(pre_conversion_args)) return final_args_list + def _get_extension_args(self) -> list[str]: + """Get Chrome args for enabling default extensions (ad blocker and cookie handler).""" + extension_paths = 
self._ensure_default_extensions_downloaded() + + args = [ + '--enable-extensions', + '--disable-extensions-file-access-check', + '--disable-extensions-http-throttling', + '--enable-extension-activity-logging', + ] + + if extension_paths: + args.append(f'--load-extension={",".join(extension_paths)}') + + return args + + def _ensure_default_extensions_downloaded(self) -> list[str]: + """ + Ensure default extensions are downloaded and cached locally. + Returns list of paths to extension directories. + """ + from pathlib import Path + + # Extension definitions - optimized for automation and content extraction + extensions = [ + { + 'name': 'uBlock Origin', + 'id': 'cjpalhdlnbpafiamejdnhcphjbkeiagm', + 'url': 'https://clients2.google.com/service/update2/crx?response=redirect&prodversion=130&acceptformat=crx3&x=id%3Dcjpalhdlnbpafiamejdnhcphjbkeiagm%26uc', + }, + { + 'name': "I still don't care about cookies", + 'id': 'edibdbjcniadpccecjdfdjjppcpchdlm', + 'url': 'https://clients2.google.com/service/update2/crx?response=redirect&prodversion=130&acceptformat=crx3&x=id%3Dedibdbjcniadpccecjdfdjjppcpchdlm%26uc', + }, + { + 'name': 'ClearURLs', + 'id': 'lckanjgmijmafbedllaakclkaicjfmnk', + 'url': 'https://clients2.google.com/service/update2/crx?response=redirect&prodversion=130&acceptformat=crx3&x=id%3Dlckanjgmijmafbedllaakclkaicjfmnk%26uc', + }, + ] + + # Create extensions cache directory + cache_dir = Path.home() / '.browser-use' / 'extensions' + cache_dir.mkdir(parents=True, exist_ok=True) + + extension_paths = [] + loaded_extension_names = [] + + for ext in extensions: + ext_dir = cache_dir / ext['id'] + crx_file = cache_dir / f'{ext["id"]}.crx' + + # Check if extension is already extracted + if ext_dir.exists() and (ext_dir / 'manifest.json').exists(): + extension_paths.append(str(ext_dir)) + loaded_extension_names.append(ext['name']) + continue + + try: + # Download extension if not cached + if not crx_file.exists(): + logger.info(f'๐Ÿ“ฆ Downloading {ext["name"]} 
extension...') + self._download_extension(ext['url'], crx_file) + + # Extract extension + if crx_file.exists(): + logger.info(f'๐Ÿ“‚ Extracting {ext["name"]} extension...') + self._extract_extension(crx_file, ext_dir) + extension_paths.append(str(ext_dir)) + loaded_extension_names.append(ext['name']) + + except Exception as e: + logger.warning(f'โš ๏ธ Failed to setup {ext["name"]} extension: {e}') + continue + + if extension_paths: + logger.info(f'โœ… Extensions ready: {len(extension_paths)} extensions loaded ({", ".join(loaded_extension_names)})') + else: + logger.warning('โš ๏ธ No default extensions could be loaded') + + return extension_paths + + def _download_extension(self, url: str, output_path: Path) -> None: + """Download extension .crx file.""" + import urllib.request + + try: + with urllib.request.urlopen(url) as response: + with open(output_path, 'wb') as f: + f.write(response.read()) + except Exception as e: + raise Exception(f'Failed to download extension: {e}') + + def _extract_extension(self, crx_path: Path, extract_dir: Path) -> None: + """Extract .crx file to directory.""" + import os + import zipfile + + # Remove existing directory + if extract_dir.exists(): + import shutil + + shutil.rmtree(extract_dir) + + extract_dir.mkdir(parents=True, exist_ok=True) + + try: + # CRX files are ZIP files with a header, try to extract as ZIP + with zipfile.ZipFile(crx_path, 'r') as zip_ref: + zip_ref.extractall(extract_dir) + + # Verify manifest exists + if not (extract_dir / 'manifest.json').exists(): + raise Exception('No manifest.json found in extension') + + except zipfile.BadZipFile: + # CRX files have a header before the ZIP data + # Skip the CRX header and extract the ZIP part + with open(crx_path, 'rb') as f: + # Read CRX header to find ZIP start + magic = f.read(4) + if magic != b'Cr24': + raise Exception('Invalid CRX file format') + + version = int.from_bytes(f.read(4), 'little') + if version == 2: + pubkey_len = int.from_bytes(f.read(4), 'little') + 
sig_len = int.from_bytes(f.read(4), 'little') + f.seek(16 + pubkey_len + sig_len) # Skip to ZIP data + elif version == 3: + header_len = int.from_bytes(f.read(4), 'little') + f.seek(12 + header_len) # Skip to ZIP data + + # Extract ZIP data + zip_data = f.read() + + # Write ZIP data to temp file and extract + import tempfile + + with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as temp_zip: + temp_zip.write(zip_data) + temp_zip.flush() + + with zipfile.ZipFile(temp_zip.name, 'r') as zip_ref: + zip_ref.extractall(extract_dir) + + os.unlink(temp_zip.name) + def kwargs_for_launch_persistent_context(self) -> BrowserLaunchPersistentContextArgs: """Return the kwargs for BrowserType.launch().""" return BrowserLaunchPersistentContextArgs(**self.model_dump(exclude={'args'}), args=self.get_args()) @@ -721,22 +880,6 @@ class BrowserProfile(BrowserConnectArgs, BrowserLaunchPersistentContextArgs, Bro """Return the kwargs for BrowserType.connect_over_cdp().""" return BrowserLaunchArgs(**self.model_dump(exclude={'args'}), args=self.get_args()) - # def preinstall_extensions(self) -> None: - # """Preinstall the extensions.""" - - # # create the local unpacked extensions dir - # extensions_dir = self.user_data_dir / 'Extensions' - # extensions_dir.mkdir(parents=True, exist_ok=True) - - # # download from the chrome web store using the chrome web store api - # for extension_id in self.extension_ids_to_preinstall: - # extension_path = extensions_dir / f'{extension_id}.crx' - # if extension_path.exists(): - # logger.warning(f'โš ๏ธ Extension {extension_id} is already installed, skipping preinstall.') - # else: - # logger.info(f'๐Ÿ” Preinstalling extension {extension_id}...') - # # TODO: copy this from ArchiveBox implementation - @observe_debug(ignore_input=True, ignore_output=True, name='detect_display_configuration') def detect_display_configuration(self) -> None: """ diff --git a/browser_use/browser/session.py b/browser_use/browser/session.py index 17b5b7667..b068f952d 
100644 --- a/browser_use/browser/session.py +++ b/browser_use/browser/session.py @@ -51,8 +51,10 @@ from browser_use.browser.views import ( TabInfo, URLNotAllowedError, ) -from browser_use.dom.clickable_element_processor.service import ClickableElementProcessor -from browser_use.dom.service import DomService + +# Lazy imports for heavy DOM services to improve startup time +# from browser_use.dom.clickable_element_processor.service import ClickableElementProcessor +# from browser_use.dom.service import DomService from browser_use.dom.views import DOMElementNode, SelectorMap from browser_use.utils import ( is_new_tab_page, @@ -160,12 +162,14 @@ def require_healthy_browser(usable_page=True, reopen_page=True): await self._recover_unresponsive_page( func.__name__, timeout_ms=int(self.browser_profile.default_navigation_timeout or 5000) + 5_000 ) + page_url = self.agent_current_page.url if self.agent_current_page else 'unknown page' self.logger.debug( - f'๐Ÿค• Crashed page recovery finished, attempting to continue with {func.__name__}() on {_log_pretty_url(self.agent_current_page.url)}...' + f'๐Ÿค• Crashed page recovery finished, attempting to continue with {func.__name__}() on {_log_pretty_url(page_url)}...' ) except Exception as e: + page_url = self.agent_current_page.url if self.agent_current_page else 'unknown page' self.logger.warning( - f'โŒ Crashed page recovery failed, could not run {func.__name__}(), page is stuck unresponsive on {_log_pretty_url(self.agent_current_page.url)}...' + f'โŒ Crashed page recovery failed, could not run {func.__name__}(), page is stuck unresponsive on {_log_pretty_url(page_url)}...' 
) raise # Re-raise to let retry decorator / callsite handle it @@ -384,10 +388,19 @@ class BrowserSession(BaseModel): # Ensure we have a context assert self.browser_context, f'Failed to create BrowserContext for browser={self.browser}' - # Configure browser - await self._setup_viewports() - await self._setup_current_page_change_listeners() - await self._start_context_tracing() + # Configure browser - run some setup tasks in parallel for speed + setup_results = await asyncio.gather( + self._setup_viewports(), + self._setup_current_page_change_listeners(), + self._start_context_tracing(), + return_exceptions=True, + ) + + # Check for exceptions in setup results + for i, result in enumerate(setup_results): + if isinstance(result, Exception): + setup_task_names = ['_setup_viewports', '_setup_current_page_change_listeners', '_start_context_tracing'] + raise Exception(f'Browser setup failed in {setup_task_names[i]}: {result}') from result self.initialized = True return self @@ -837,6 +850,7 @@ class BrowserSession(BaseModel): atexit.register(shudown_playwright) + @observe_debug(ignore_input=True, ignore_output=True, name='setup_browser_via_passed_objects') async def setup_browser_via_passed_objects(self) -> None: """Override to customize the set up of the connection to an existing browser""" @@ -878,6 +892,7 @@ class BrowserSession(BaseModel): self.logger.info(f'๐ŸŽญ Connected to existing user-provided browser: {self.browser_context}') self._set_browser_keep_alive(True) # we connected to an existing browser, dont kill it at the end + @observe_debug(ignore_input=True, ignore_output=True, name='setup_browser_via_browser_pid') async def setup_browser_via_browser_pid(self) -> None: """if browser_pid is provided, calcuclate its CDP URL by looking for --remote-debugging-port=... 
in its CLI args, then connect to it""" @@ -922,11 +937,10 @@ class BrowserSession(BaseModel): # Wait for CDP port to become available (Chrome might still be starting) import httpx - # Add initial delay to give Chrome time to start up before first check - await asyncio.sleep(2) + # No initial sleep needed - the polling loop below handles waiting if Chrome isn't ready yet async with httpx.AsyncClient() as client: - for i in range(30): # 30 second timeout + for i in range(30): # timeout # First check if the Chrome process has exited try: chrome_process = psutil.Process(pid=self.browser_pid) @@ -988,7 +1002,7 @@ class BrowserSession(BaseModel): except (httpx.ConnectError, httpx.TimeoutException): if i == 0: self.logger.debug(f'โณ Waiting for Chrome CDP port {debug_port} to become available...') - await asyncio.sleep(1) + await asyncio.sleep(0.5) else: self.logger.error(f'โŒ Chrome CDP port {debug_port} did not become available after 30 seconds') self.browser_pid = None @@ -1010,6 +1024,7 @@ class BrowserSession(BaseModel): ) self._set_browser_keep_alive(True) # we connected to an existing browser, dont kill it at the end + @observe_debug(ignore_input=True, ignore_output=True, name='setup_browser_via_wss_url') async def setup_browser_via_wss_url(self) -> None: """check for a passed wss_url, connect to a remote playwright browser server via WSS""" @@ -1044,7 +1059,8 @@ class BrowserSession(BaseModel): ) self._set_browser_keep_alive(True) # we connected to an existing browser, dont kill it at the end - @retry(wait=1, retries=2, timeout=45, semaphore_limit=1, semaphore_scope='self', semaphore_lax=False) + @observe_debug(ignore_input=True, ignore_output=True, name='setup_new_browser_context') + @retry(wait=0.1, retries=5, timeout=45, semaphore_limit=1, semaphore_scope='self', semaphore_lax=False) async def setup_new_browser_context(self) -> None: """Launch a new browser and browser_context""" # Double-check after semaphore acquisition to prevent duplicate browser launches 
@@ -1059,6 +1075,7 @@ class BrowserSession(BaseModel): pass await self._unsafe_setup_new_browser_context() + @observe_debug(ignore_input=True, ignore_output=True, name='_unsafe_setup_new_browser_context') async def _unsafe_setup_new_browser_context(self) -> None: """Unsafe browser context setup without retry protection.""" @@ -2015,7 +2032,6 @@ class BrowserSession(BaseModel): await page.wait_for_selector(selector, state='visible', timeout=timeout) @observe_debug(name='remove_highlights', ignore_output=True, ignore_input=True) - @require_healthy_browser(usable_page=True, reopen_page=True) @time_execution_async('--remove_highlights') @retry(timeout=2, retries=0) async def remove_highlights(self): @@ -2048,14 +2064,16 @@ class BrowserSession(BaseModel): self.logger.debug(f'โš ๏ธ Failed to remove highlights (this is usually ok): {type(e).__name__}: {e}') # Don't raise the error since this is not critical functionality + @observe_debug(ignore_output=True, name='get_dom_element_by_index') @require_healthy_browser(usable_page=True, reopen_page=True) async def get_dom_element_by_index(self, index: int) -> DOMElementNode | None: """Get DOM element by index.""" selector_map = await self.get_selector_map() return selector_map.get(index) - @require_healthy_browser(usable_page=True, reopen_page=True) @time_execution_async('--click_element_node') + @observe_debug(ignore_input=True, name='click_element_node') + @require_healthy_browser(usable_page=True, reopen_page=True) async def _click_element_node(self, element_node: DOMElementNode) -> str | None: """ Optimized method to click an element using xpath. 
@@ -2069,7 +2087,8 @@ class BrowserSession(BaseModel): element_handle = await self.get_locate_element(element_node) if element_handle is None: - raise Exception(f'Element: {repr(element_node)} not found') + self.logger.debug(f'Element: {repr(element_node)} not found') + raise Exception('Element not found') async def perform_click(click_func): """Performs the actual click, handling both download and navigation scenarios.""" @@ -2163,10 +2182,10 @@ class BrowserSession(BaseModel): except URLNotAllowedError as e: raise e except Exception as e: - raise Exception(f'Failed to click element: {repr(element_node)}. Error: {str(e)}') + raise Exception(f'Failed to click element. Error: {str(e)}') @time_execution_async('--get_tabs_info') - @retry(timeout=6, retries=1) + @retry(timeout=3, retries=1) @require_healthy_browser(usable_page=False, reopen_page=False) async def get_tabs_info(self) -> list[TabInfo]: """Get information about all tabs""" @@ -2174,7 +2193,7 @@ class BrowserSession(BaseModel): tabs_info = [] for page_id, page in enumerate(self.browser_context.pages): try: - title = await asyncio.wait_for(page.title(), timeout=3.0) + title = await asyncio.wait_for(page.title(), timeout=2.0) tab_info = TabInfo(page_id=page_id, url=page.url, title=title) except Exception: # page.title() can hang forever on tabs that are crashed/disappeared/about:blank @@ -2255,8 +2274,14 @@ class BrowserSession(BaseModel): # Check if URL is allowed if not self._is_url_allowed(normalized_url): raise BrowserError(f'โ›”๏ธ Navigation to non-allowed URL: {normalized_url}') - - timeout_ms = min(3000, int(timeout_ms or self.browser_profile.default_navigation_timeout or 12000)) + # If timeout_ms is not None, use it (even if 0); else try profile.default_navigation_timeout (even if 0); else 12000 + if timeout_ms is not None: + user_timeout_ms = int(timeout_ms) + elif self.browser_profile.default_navigation_timeout is not None: + user_timeout_ms = int(self.browser_profile.default_navigation_timeout) + 
else: + user_timeout_ms = 12000 + timeout_ms = min(3000, user_timeout_ms) # Handle new tab creation if new_tab: @@ -2279,7 +2304,7 @@ class BrowserSession(BaseModel): # Navigate to URL try: - # Use asyncio.wait to prevent hanging on slow page loads + # Use asyncio.wait to prevent hanging on slow page loads # Don't cap the timeout - respect what was requested self.logger.debug(f'๐Ÿงญ Starting navigation to {_log_pretty_url(normalized_url)} with timeout {timeout_ms}ms') nav_task = asyncio.create_task(page.goto(normalized_url, wait_until='load', timeout=timeout_ms)) @@ -2797,15 +2822,27 @@ class BrowserSession(BaseModel): @observe_debug(ignore_input=True, ignore_output=True, name='wait_for_page_and_frames_load') async def _wait_for_page_and_frames_load(self, timeout_overwrite: float | None = None): """ - Ensures page is fully loaded before continuing. - Waits for either network to be idle or minimum WAIT_TIME, whichever is longer. + Ensures page is fully loaded and stable before continuing. + Waits for network idle, DOM stability, and minimum WAIT_TIME. Also checks if the loaded URL is allowed. + + Parameters: + ----------- + timeout_overwrite: float | None + Override the minimum wait time """ # Start timing start_time = time.time() # Wait for page load page = await self.get_current_page() + + # Skip network waiting for new tab pages (about:blank, chrome://new-tab-page, etc.) 
+ # These pages load instantly and don't need network idle time + if is_new_tab_page(page.url): + self.logger.debug(f'โšก Skipping page load wait for new tab page: {page.url}') + return + try: await self._wait_for_stable_network() @@ -3052,7 +3089,9 @@ class BrowserSession(BaseModel): @observe_debug(ignore_input=True, ignore_output=True) @time_execution_async('--get_state_summary') @require_healthy_browser(usable_page=True, reopen_page=True) - async def get_state_summary(self, cache_clickable_elements_hashes: bool) -> BrowserStateSummary: + async def get_state_summary( + self, cache_clickable_elements_hashes: bool, include_screenshot: bool = True + ) -> BrowserStateSummary: self.logger.debug('๐Ÿ”„ Starting get_state_summary...') """Get a summary of the current browser state @@ -3065,13 +3104,19 @@ class BrowserSession(BaseModel): If True, cache the clickable elements hashes for the current state. This is used to calculate which elements are new to the LLM since the last message, which helps reduce token usage. + include_screenshot: bool + If True, include screenshot in the state summary. Set to False to improve performance + when screenshots are not needed (e.g., in multi_act element validation). 
""" - await self._wait_for_page_and_frames_load() - updated_state = await self._get_updated_state() + + updated_state = await self._get_updated_state(include_screenshot=include_screenshot) # Find out which elements are new # Do this only if url has not changed if cache_clickable_elements_hashes: + # Lazy import heavy DOM service + from browser_use.dom.clickable_element_processor.service import ClickableElementProcessor + # if we are on the same url as the last state, we can use the cached hashes if self._cached_clickable_element_hashes and self._cached_clickable_element_hashes.url == updated_state.url: # Pointers, feel free to edit in place @@ -3142,20 +3187,12 @@ class BrowserSession(BaseModel): ) @observe_debug(ignore_input=True, ignore_output=True, name='get_updated_state') - async def _get_updated_state(self, focus_element: int = -1) -> BrowserStateSummary: + async def _get_updated_state(self, focus_element: int = -1, include_screenshot: bool = True) -> BrowserStateSummary: """Update and return state.""" # Check if current page is still valid, if not switch to another available page page = await self.get_current_page() - try: - # Test if page is still accessible - # NOTE: This also happens on invalid urls like www.sadfdsafdssdafd.com - await asyncio.wait_for(page.evaluate('1'), timeout=2.5) - except Exception as e: - self.logger.debug(f'๐Ÿ‘‹ Current page is not accessible: {type(e).__name__}: {e}') - raise BrowserError('Page is not accessible') - try: self.logger.debug('๐Ÿงน Removing highlights...') try: @@ -3172,6 +3209,8 @@ class BrowserSession(BaseModel): self.logger.debug(f'PDF auto-download check failed: {type(e).__name__}: {e}') self.logger.debug('๐ŸŒณ Starting DOM processing...') + from browser_use.dom.service import DomService + dom_service = DomService(page, logger=self.logger) try: content = await asyncio.wait_for( @@ -3228,13 +3267,16 @@ class BrowserSession(BaseModel): # ) # ) - try: - self.logger.debug('๐Ÿ“ธ Capturing screenshot...') - # Reasonable 
timeout for screenshot - screenshot_b64 = await self.take_screenshot() - # self.logger.debug('โœ… Screenshot completed') - except Exception as e: - self.logger.warning(f'โŒ Screenshot failed for {_log_pretty_url(page.url)}: {type(e).__name__} {e}') + if include_screenshot: + try: + self.logger.debug('๐Ÿ“ธ Capturing screenshot...') + # Reasonable timeout for screenshot + screenshot_b64 = await self.take_screenshot() + # self.logger.debug('โœ… Screenshot completed') + except Exception as e: + self.logger.warning(f'โŒ Screenshot failed for {_log_pretty_url(page.url)}: {type(e).__name__} {e}') + screenshot_b64 = None + else: screenshot_b64 = None # Get comprehensive page information @@ -3475,6 +3517,7 @@ class BrowserSession(BaseModel): 'Browser is unable to load any new about:blank pages (something is very wrong or browser is extremely overloaded)' ) + @observe_debug(ignore_input=True, name='recover_unresponsive_page') async def _recover_unresponsive_page(self, calling_method: str, timeout_ms: int | None = None) -> None: """Recover from an unresponsive page by closing and reopening it.""" self.logger.warning(f'โš ๏ธ Page JS engine became unresponsive in {calling_method}(), attempting recovery...') @@ -3828,6 +3871,7 @@ class BrowserSession(BaseModel): @require_healthy_browser(usable_page=True, reopen_page=True) @time_execution_async('--get_locate_element') + @observe_debug(ignore_input=True, name='get_locate_element') async def get_locate_element(self, element: DOMElementNode) -> ElementHandle | None: page = await self.get_current_page() current_frame = page @@ -3881,7 +3925,7 @@ class BrowserSession(BaseModel): if element_handle: is_visible = await self._is_visible(element_handle) if is_visible: - await element_handle.scroll_into_view_if_needed() + await element_handle.scroll_into_view_if_needed(timeout=1_000) return element_handle return None except Exception as e: @@ -3897,7 +3941,7 @@ class BrowserSession(BaseModel): if element_handle: is_visible = await 
self._is_visible(element_handle) if is_visible: - await element_handle.scroll_into_view_if_needed() + await element_handle.scroll_into_view_if_needed(timeout=1_000) return element_handle except Exception as xpath_e: self.logger.error( @@ -3924,7 +3968,7 @@ class BrowserSession(BaseModel): if element_handle: is_visible = await self._is_visible(element_handle) if is_visible: - await element_handle.scroll_into_view_if_needed() + await element_handle.scroll_into_view_if_needed(timeout=1_000) return element_handle return None except Exception as e: @@ -3945,7 +3989,7 @@ class BrowserSession(BaseModel): if element_handle: is_visible = await self._is_visible(element_handle) if is_visible: - await element_handle.scroll_into_view_if_needed() + await element_handle.scroll_into_view_if_needed(timeout=1_000) return element_handle return None except Exception as e: @@ -3989,7 +4033,7 @@ class BrowserSession(BaseModel): is_visible = await self._is_visible(element_handle) if is_visible: - await element_handle.scroll_into_view_if_needed() + await element_handle.scroll_into_view_if_needed(timeout=1_000) return element_handle except Exception as e: self.logger.error( @@ -3999,6 +4043,7 @@ class BrowserSession(BaseModel): @require_healthy_browser(usable_page=True, reopen_page=True) @time_execution_async('--input_text_element_node') + @observe_debug(ignore_input=True, name='input_text_element_node') async def _input_text_element_node(self, element_node: DOMElementNode, text: str): """ Input text into an element with proper error handling and state management. 
@@ -4022,7 +4067,7 @@ class BrowserSession(BaseModel): # let's first try to click and type try: await element_handle.evaluate('el => {el.textContent = ""; el.value = "";}') - await element_handle.click() + await element_handle.click(timeout=2_000) # Add 2 second timeout await asyncio.sleep(0.1) # Increased sleep time page = await self.get_current_page() await page.keyboard.type(text) @@ -4044,9 +4089,9 @@ class BrowserSession(BaseModel): try: if (await is_contenteditable.json_value() or tag_name == 'input') and not (readonly or disabled): await element_handle.evaluate('el => {el.textContent = ""; el.value = "";}') - await element_handle.type(text, delay=5) + await element_handle.type(text, delay=5, timeout=5_000) # Add 5 second timeout else: - await element_handle.fill(text) + await element_handle.fill(text, timeout=3_000) # Add 3 second timeout except Exception as e: self.logger.error(f'Error during input text into element: {type(e).__name__}: {e}') raise BrowserError(f'Failed to input text into element: {repr(element_node)}') @@ -4471,32 +4516,29 @@ class BrowserSession(BaseModel): except Exception as e: self.logger.debug(f'โŒ Failed to show ๐Ÿ“€ DVD loading animation: {type(e).__name__}: {e}') - @observe_debug(ignore_input=True, ignore_output=True, name='get_state_summary_with_fallback') - @require_healthy_browser(usable_page=True, reopen_page=True) - @time_execution_async('--get_state_summary_with_fallback') - async def get_state_summary_with_fallback(self, cache_clickable_elements_hashes: bool = True) -> BrowserStateSummary: - """Get browser state with fallback to minimal state on errors - - This method first tries to get a full state summary. If that fails, - it falls back to a minimal state summary to allow basic navigation. 
+ @observe_debug(ignore_input=True, ignore_output=True, name='get_browser_state_with_recovery') + async def get_browser_state_with_recovery( + self, cache_clickable_elements_hashes: bool = True, include_screenshot: bool = True + ) -> BrowserStateSummary: + """Get browser state with multiple fallback strategies for error recovery Parameters: ----------- cache_clickable_elements_hashes: bool If True, cache the clickable elements hashes for the current state. - - Returns: - -------- - BrowserStateSummary: Either full state or minimal fallback state + include_screenshot: bool + If True, include screenshot in the state summary. Set to False to improve performance + when screenshots are not needed (e.g., in multi_act element validation). """ - # Try 1: Full state summary (current implementation) + + # Try 1: Full state summary (current implementation) - like main branch try: - return await self.get_state_summary(cache_clickable_elements_hashes) + await self._wait_for_page_and_frames_load() + return await self.get_state_summary(cache_clickable_elements_hashes, include_screenshot=include_screenshot) except Exception as e: self.logger.warning(f'Full state retrieval failed: {type(e).__name__}: {e}') - self.logger.warning('๐Ÿ”„ Falling back to minimal state summary') - # Try 2: Minimal state summary as fallback + self.logger.warning('๐Ÿ”„ Falling back to minimal state summary') return await self.get_minimal_state_summary() async def _is_pdf_viewer(self, page: Page) -> bool: diff --git a/browser_use/controller/service.py b/browser_use/controller/service.py index ff9a180b1..c5f1e210b 100644 --- a/browser_use/controller/service.py +++ b/browser_use/controller/service.py @@ -130,23 +130,20 @@ class Controller(Generic[Context]): await browser_session.go_back() msg = '๐Ÿ”™ Navigated back' logger.info(msg) - return ActionResult(extracted_content=msg, include_in_memory=True, long_term_memory='Navigated back') + return ActionResult(extracted_content=msg) - # wait for x seconds - - 
@self.registry.action('Wait for x seconds default 3 (max 10 seconds)') + @self.registry.action( + 'Wait for x seconds default 3 (max 10 seconds). This can be used to wait until the page is fully loaded.' + ) async def wait(seconds: int = 3): # Cap wait time at maximum 10 seconds - actual_seconds = min(max(seconds, 0), 10) - if actual_seconds != seconds: - msg = f'๐Ÿ•’ Waiting for {actual_seconds} seconds (capped from {seconds} seconds, max 10 seconds)' - else: - msg = f'๐Ÿ•’ Waiting for {actual_seconds} seconds' + # Reduce the wait time by 3 seconds to account for the llm call which takes at least 3 seconds + # So if the model decides to wait for 5 seconds, the llm call took at least 3 seconds, so we only need to wait for 2 seconds + actual_seconds = min(max(seconds - 3, 0), 10) + msg = f'๐Ÿ•’ Waiting for {actual_seconds + 3} seconds' logger.info(msg) await asyncio.sleep(actual_seconds) - return ActionResult( - extracted_content=msg, include_in_memory=True, long_term_memory=f'Waited for {actual_seconds} seconds' - ) + return ActionResult(extracted_content=msg) # Element Interaction Actions diff --git a/browser_use/dom/service.py b/browser_use/dom/service.py index dc059732a..1bc0000a3 100644 --- a/browser_use/dom/service.py +++ b/browser_use/dom/service.py @@ -15,6 +15,7 @@ from browser_use.dom.views import ( SelectorMap, ViewportInfo, ) +from browser_use.observability import observe_debug from browser_use.utils import is_new_tab_page, time_execution_async # @dataclass @@ -34,6 +35,7 @@ class DomService: self.js_code = resources.files('browser_use.dom.dom_tree').joinpath('index.js').read_text() # region - Clickable elements + @observe_debug(ignore_input=True, ignore_output=True, name='get_clickable_elements') @time_execution_async('--get_clickable_elements') async def get_clickable_elements( self, diff --git a/browser_use/llm/__init__.py b/browser_use/llm/__init__.py index 59169ba34..f409f1839 100644 --- a/browser_use/llm/__init__.py +++ 
b/browser_use/llm/__init__.py @@ -4,14 +4,10 @@ We have switched all of our code from langchain to openai.types.chat.chat_comple For easier transition we have """ -from browser_use.llm.anthropic.chat import ChatAnthropic -from browser_use.llm.aws.chat_anthropic import ChatAnthropicBedrock -from browser_use.llm.aws.chat_bedrock import ChatAWSBedrock -from browser_use.llm.azure.chat import ChatAzureOpenAI +from typing import TYPE_CHECKING + +# Lightweight imports that are commonly used from browser_use.llm.base import BaseChatModel -from browser_use.llm.deepseek.chat import ChatDeepSeek -from browser_use.llm.google.chat import ChatGoogle -from browser_use.llm.groq.chat import ChatGroq from browser_use.llm.messages import ( AssistantMessage, BaseMessage, @@ -27,11 +23,52 @@ from browser_use.llm.messages import ( from browser_use.llm.messages import ( ContentPartTextParam as ContentText, ) -from browser_use.llm.ollama.chat import ChatOllama -from browser_use.llm.openai.chat import ChatOpenAI -from browser_use.llm.openrouter.chat import ChatOpenRouter -# Make better names for the message +# Type stubs for lazy imports +if TYPE_CHECKING: + from browser_use.llm.anthropic.chat import ChatAnthropic + from browser_use.llm.aws.chat_anthropic import ChatAnthropicBedrock + from browser_use.llm.aws.chat_bedrock import ChatAWSBedrock + from browser_use.llm.azure.chat import ChatAzureOpenAI + from browser_use.llm.deepseek.chat import ChatDeepSeek + from browser_use.llm.google.chat import ChatGoogle + from browser_use.llm.groq.chat import ChatGroq + from browser_use.llm.ollama.chat import ChatOllama + from browser_use.llm.openai.chat import ChatOpenAI + from browser_use.llm.openrouter.chat import ChatOpenRouter + +# Lazy imports mapping for heavy chat models +_LAZY_IMPORTS = { + 'ChatAnthropic': ('browser_use.llm.anthropic.chat', 'ChatAnthropic'), + 'ChatAnthropicBedrock': ('browser_use.llm.aws.chat_anthropic', 'ChatAnthropicBedrock'), + 'ChatAWSBedrock': 
('browser_use.llm.aws.chat_bedrock', 'ChatAWSBedrock'), + 'ChatAzureOpenAI': ('browser_use.llm.azure.chat', 'ChatAzureOpenAI'), + 'ChatDeepSeek': ('browser_use.llm.deepseek.chat', 'ChatDeepSeek'), + 'ChatGoogle': ('browser_use.llm.google.chat', 'ChatGoogle'), + 'ChatGroq': ('browser_use.llm.groq.chat', 'ChatGroq'), + 'ChatOllama': ('browser_use.llm.ollama.chat', 'ChatOllama'), + 'ChatOpenAI': ('browser_use.llm.openai.chat', 'ChatOpenAI'), + 'ChatOpenRouter': ('browser_use.llm.openrouter.chat', 'ChatOpenRouter'), +} + + +def __getattr__(name: str): + """Lazy import mechanism for heavy chat model imports.""" + if name in _LAZY_IMPORTS: + module_path, attr_name = _LAZY_IMPORTS[name] + try: + from importlib import import_module + + module = import_module(module_path) + attr = getattr(module, attr_name) + # Cache the imported attribute in the module's globals + globals()[name] = attr + return attr + except ImportError as e: + raise ImportError(f'Failed to import {name} from {module_path}: {e}') from e + + raise AttributeError(f"module '{__name__}' has no attribute '{name}'") + __all__ = [ # Message types -> for easier transition from langchain diff --git a/browser_use/llm/aws/__init__.py b/browser_use/llm/aws/__init__.py index 69afb3a95..cb2def920 100644 --- a/browser_use/llm/aws/__init__.py +++ b/browser_use/llm/aws/__init__.py @@ -1,5 +1,34 @@ -from browser_use.llm.aws.chat_anthropic import ChatAnthropicBedrock -from browser_use.llm.aws.chat_bedrock import ChatAWSBedrock +from typing import TYPE_CHECKING + +# Type stubs for lazy imports +if TYPE_CHECKING: + from browser_use.llm.aws.chat_anthropic import ChatAnthropicBedrock + from browser_use.llm.aws.chat_bedrock import ChatAWSBedrock + +# Lazy imports mapping for AWS chat models +_LAZY_IMPORTS = { + 'ChatAnthropicBedrock': ('browser_use.llm.aws.chat_anthropic', 'ChatAnthropicBedrock'), + 'ChatAWSBedrock': ('browser_use.llm.aws.chat_bedrock', 'ChatAWSBedrock'), +} + + +def __getattr__(name: str): + """Lazy import 
mechanism for AWS chat models.""" + if name in _LAZY_IMPORTS: + module_path, attr_name = _LAZY_IMPORTS[name] + try: + from importlib import import_module + + module = import_module(module_path) + attr = getattr(module, attr_name) + # Cache the imported attribute in the module's globals + globals()[name] = attr + return attr + except ImportError as e: + raise ImportError(f'Failed to import {name} from {module_path}: {e}') from e + + raise AttributeError(f"module '{__name__}' has no attribute '{name}'") + __all__ = [ 'ChatAWSBedrock', diff --git a/browser_use/llm/openai/chat.py b/browser_use/llm/openai/chat.py index 96ce849fa..4883478b1 100644 --- a/browser_use/llm/openai/chat.py +++ b/browser_use/llm/openai/chat.py @@ -35,7 +35,8 @@ class ChatOpenAI(BaseChatModel): model: ChatModel | str # Model params - temperature: float | None = None + temperature: float | None = 0.2 + frequency_penalty: float | None = 0.05 reasoning_effort: ReasoningEffort = 'low' # Client initialization parameters @@ -50,6 +51,8 @@ class ChatOpenAI(BaseChatModel): default_query: Mapping[str, object] | None = None http_client: httpx.AsyncClient | None = None _strict_response_validation: bool = False + max_completion_tokens: int | None = 8000 + top_p: float | None = None # Static @property @@ -144,12 +147,24 @@ class ChatOpenAI(BaseChatModel): try: model_params: dict[str, Any] = {} - if self.model in ReasoningModels: - model_params['reasoning_effort'] = self.reasoning_effort if self.temperature is not None: model_params['temperature'] = self.temperature + if self.frequency_penalty is not None: + model_params['frequency_penalty'] = self.frequency_penalty + + if self.max_completion_tokens is not None: + model_params['max_completion_tokens'] = self.max_completion_tokens + + if self.top_p is not None: + model_params['top_p'] = self.top_p + + if self.model in ReasoningModels: + model_params['reasoning_effort'] = self.reasoning_effort + model_params['temperature'] = 1 + 
model_params['frequency_penalty'] = 0 + if output_format is None: # Return string response response = await self.get_client().chat.completions.create( diff --git a/browser_use/mcp/server.py b/browser_use/mcp/server.py index 84a0e840b..55d35ee82 100644 --- a/browser_use/mcp/server.py +++ b/browser_use/mcp/server.py @@ -659,7 +659,7 @@ class BrowserUseServer: if not self.browser_session: return 'Error: No browser session active' - state = await self.browser_session.get_state_summary(cache_clickable_elements_hashes=False) + state = await self.browser_session.get_browser_state_with_recovery(cache_clickable_elements_hashes=False) result = { 'url': state.url, diff --git a/browser_use/telemetry/__init__.py b/browser_use/telemetry/__init__.py index 40282dc9e..222bc9ba1 100644 --- a/browser_use/telemetry/__init__.py +++ b/browser_use/telemetry/__init__.py @@ -2,18 +2,50 @@ Telemetry for Browser Use. """ -from browser_use.telemetry.service import ProductTelemetry -from browser_use.telemetry.views import ( - BaseTelemetryEvent, - CLITelemetryEvent, - MCPClientTelemetryEvent, - MCPServerTelemetryEvent, -) +from typing import TYPE_CHECKING + +# Type stubs for lazy imports +if TYPE_CHECKING: + from browser_use.telemetry.service import ProductTelemetry + from browser_use.telemetry.views import ( + BaseTelemetryEvent, + CLITelemetryEvent, + MCPClientTelemetryEvent, + MCPServerTelemetryEvent, + ) + +# Lazy imports mapping +_LAZY_IMPORTS = { + 'ProductTelemetry': ('browser_use.telemetry.service', 'ProductTelemetry'), + 'BaseTelemetryEvent': ('browser_use.telemetry.views', 'BaseTelemetryEvent'), + 'CLITelemetryEvent': ('browser_use.telemetry.views', 'CLITelemetryEvent'), + 'MCPClientTelemetryEvent': ('browser_use.telemetry.views', 'MCPClientTelemetryEvent'), + 'MCPServerTelemetryEvent': ('browser_use.telemetry.views', 'MCPServerTelemetryEvent'), +} + + +def __getattr__(name: str): + """Lazy import mechanism for telemetry components.""" + if name in _LAZY_IMPORTS: + module_path, 
attr_name = _LAZY_IMPORTS[name] + try: + from importlib import import_module + + module = import_module(module_path) + attr = getattr(module, attr_name) + # Cache the imported attribute in the module's globals + globals()[name] = attr + return attr + except ImportError as e: + raise ImportError(f'Failed to import {name} from {module_path}: {e}') from e + + raise AttributeError(f"module '{__name__}' has no attribute '{name}'") + __all__ = [ 'BaseTelemetryEvent', 'ProductTelemetry', + 'CLITelemetryEvent', 'MCPClientTelemetryEvent', 'MCPServerTelemetryEvent', - 'CLITelemetryEvent', ] diff --git a/pyproject.toml b/pyproject.toml index 47fc212dd..9d3cbb632 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,10 +31,10 @@ dependencies = [ "typing-extensions>=4.12.2", "uuid7>=0.1.0", "authlib>=1.6.0", - "google-genai>=1.21.1", - "openai>=1.81.0", - "anthropic>=0.54.0", - "groq>=0.28.0", + "google-genai>=1.26.0", + "openai>=1.97.0", + "anthropic>=0.58.2", + "groq>=0.30.0", "ollama>=0.5.1", "google-api-python-client>=2.174.0", "google-auth>=2.40.3", diff --git a/tests/ci/test_controller.py b/tests/ci/test_controller.py index 2778a125f..94de1b428 100644 --- a/tests/ci/test_controller.py +++ b/tests/ci/test_controller.py @@ -328,8 +328,30 @@ class TestControllerIntegration: assert result.extracted_content is not None assert 'Waiting for' in result.extracted_content - # Verify that at least 1 second has passed - assert end_time - start_time >= 0.9 # Allow some timing margin + # Verify that less than 0.1 second has passed (because we deducted 3 seconds to account for the llm call) + assert end_time - start_time <= 0.1 # Allow some timing margin + + # longer wait + # Create wait action for 1 second - fix to use a dictionary + wait_action = {'wait': {'seconds': 5}} # Corrected format + + # Record start time + start_time = time.time() + + # Execute wait action + result = await controller.act(WaitActionModel(**wait_action), browser_session) + + # Record end time + end_time = 
time.time() + + # Verify the result + assert isinstance(result, ActionResult) + assert result.extracted_content is not None + assert 'Waiting for' in result.extracted_content + + # Verify that we took 2 sec (5s-3s (llm call)= 2s) + assert end_time - start_time <= 2.1 # Allow some timing margin + assert end_time - start_time >= 1.9 # Allow some timing margin async def test_go_back_action(self, controller, browser_session, base_url): """Test that go_back action navigates to the previous page."""